X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fdraw%2Fdraw_vs_aos.c;h=6141ba9cbf70b015a3d1bee7b7b04a4681004c20;hb=0c31661e73dd2979df22a275452efc71c7064f81;hp=5d4a8b38c876f133b37a75dab77683291b188f46;hpb=2161b0fafcdc16703162dd489d2ec1e7114cce4c;p=mesa.git diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 5d4a8b38c87..6141ba9cbf7 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -29,12 +29,14 @@ */ -#include "pipe/p_util.h" +#include "util/u_memory.h" +#include "util/u_math.h" #include "pipe/p_shader_tokens.h" -#include "tgsi/util/tgsi_parse.h" -#include "tgsi/util/tgsi_util.h" -#include "tgsi/exec/tgsi_exec.h" -#include "tgsi/util/tgsi_dump.h" +#include "pipe/p_debug.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" #include "draw_vs.h" #include "draw_vs_aos.h" @@ -43,6 +45,7 @@ #ifdef PIPE_ARCH_X86 #define DISASSEM 0 +#define FAST_MATH 1 static const char *files[] = { @@ -67,34 +70,45 @@ static INLINE boolean eq( struct x86_reg a, } struct x86_reg aos_get_x86( struct aos_compilation *cp, + unsigned which_reg, /* quick hack */ unsigned value ) { - if (cp->ebp != value) { + struct x86_reg reg; + + if (which_reg == 0) + reg = cp->temp_EBP; + else + reg = cp->tmp_EAX; + + if (cp->x86_reg[which_reg] != value) { unsigned offset; switch (value) { case X86_IMMEDIATES: + assert(which_reg == 0); offset = Offset(struct aos_machine, immediates); break; case X86_CONSTANTS: + assert(which_reg == 1); offset = Offset(struct aos_machine, constants); break; - case X86_ATTRIBS: - offset = Offset(struct aos_machine, attrib); + case X86_BUFFERS: + assert(which_reg == 0); + offset = Offset(struct aos_machine, buffer); break; default: assert(0); offset = 0; } - x86_mov(cp->func, cp->temp_EBP, + + x86_mov(cp->func, reg, x86_make_disp(cp->machine_EDX, offset)); - /* x86_deref(x86_make_disp(cp->machine_EDX, offset))); */ - cp->ebp = value; + cp->x86_reg[which_reg] = value; } - return cp->temp_EBP; + return reg; } @@ -106,22 +120,27 @@ static struct x86_reg get_reg_ptr(struct aos_compilation *cp, switch (file) { case TGSI_FILE_INPUT: + assert(idx < MAX_INPUTS); return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); case TGSI_FILE_OUTPUT: return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); case TGSI_FILE_TEMPORARY: + assert(idx < MAX_TEMPS); return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); case AOS_FILE_INTERNAL: + assert(idx < MAX_INTERNALS); return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); case TGSI_FILE_IMMEDIATE: - return x86_make_disp(aos_get_x86(cp, X86_IMMEDIATES), idx * 4 * sizeof(float)); + assert(idx < MAX_IMMEDIATES); /* just a sanity check */ + return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float)); case TGSI_FILE_CONSTANT: - return x86_make_disp(aos_get_x86(cp, X86_CONSTANTS), idx * 4 * sizeof(float)); + assert(idx < MAX_CONSTANTS); /* just a sanity check */ + return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float)); default: ERROR(cp, "unknown reg file"); @@ -177,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx ) } +void aos_spill_all( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +} + + static struct x86_reg get_xmm_writable( struct aos_compilation *cp, struct x86_reg reg ) { @@ -1369,13 +1400,27 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } + + /* A wrapper for powf(). * Makes sure it is cdecl and operates on floats. */ static float PIPE_CDECL _powerf( float x, float y ) { +#if FAST_MATH + return util_fast_pow(x, y); +#else return powf( x, y ); +#endif +} + +#if FAST_MATH +static float PIPE_CDECL _exp2(float x) +{ + return util_fast_exp2(x); } +#endif + /* Really not sufficient -- need to check for conditions that could * generate inf/nan values, which will slow things down hugely. @@ -1413,6 +1458,7 @@ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_inst x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + /* tmp_EAX has been pushed & will be restored below */ x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf ); x86_call( cp->func, cp->tmp_EAX ); @@ -1430,6 +1476,48 @@ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_inst } +#if FAST_MATH +static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + uint i; + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); + + x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + + /* tmp_EAX has been pushed & will be restored below */ + x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); + x86_call( cp->func, cp->tmp_EAX ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); + + x86_cdecl_caller_pop_regs( cp->func ); + + /* Note retval on x87 stack: + */ + cp->func->x87_stack++; + + x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); + + return TRUE; +} +#endif + + static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); @@ -1544,6 +1632,17 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } +static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg tmp0 = aos_get_xmm_reg(cp); + + sse2_cvttps2dq(cp->func, tmp0, arg0); + sse2_cvtdq2ps(cp->func, tmp0, tmp0); + + store_dest(cp, &op->FullDstRegisters[0], tmp0); + return TRUE; +} static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { @@ -1650,7 +1749,16 @@ emit_instruction( struct aos_compilation *cp, return emit_RND(cp, inst); case TGSI_OPCODE_EXPBASE2: +#if FAST_MATH + return emit_EXPBASE2(cp, inst); +#elif 0 + /* this seems to fail for "larger" exponents. + * See glean tvertProg1's EX2 test. + */ return emit_EX2(cp, inst); +#else + return FALSE; +#endif case TGSI_OPCODE_LOGBASE2: return emit_LG2(cp, inst); @@ -1673,6 +1781,9 @@ emit_instruction( struct aos_compilation *cp, case TGSI_OPCODE_SIN: return emit_SIN(cp, inst); + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(cp, inst); + case TGSI_OPCODE_END: return TRUE; @@ -1808,6 +1919,8 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, struct aos_compilation cp; unsigned fixup, label; + util_init_math(); + tgsi_parse_init( &parse, varient->base.vs->state.tokens ); memset(&cp, 0, sizeof(cp)); @@ -1852,6 +1965,11 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, save_fpu_state( &cp ); set_fpu_round_nearest( &cp ); + aos_init_inputs( &cp, linear ); + + cp.x86_reg[0] = 0; + cp.x86_reg[1] = 0; + /* Note address for loop jump */ label = x86_get_label(cp.func); @@ -1931,13 +2049,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, /* Incr index */ - if (linear) { - x86_inc(cp.func, cp.idx_EBX); - } - else { - x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); - } - + aos_incr_inputs( &cp, linear ); } /* decr count, loop if not zero */ @@ -1978,15 +2090,13 @@ static void vaos_set_buffer( struct draw_vs_varient *varient, unsigned stride ) { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; - unsigned i; - for (i = 0; i < vaos->base.key.nr_inputs; i++) { - if (vaos->base.key.element[i].in.buffer == buf) { - vaos->attrib[i].input_ptr = ((char *)ptr + - vaos->base.key.element[i].in.offset); - vaos->attrib[i].input_stride = stride; - } + if (buf < vaos->nr_vb) { + vaos->buffer[buf].base_ptr = (char *)ptr; + vaos->buffer[buf].stride = stride; } + + if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); } @@ -1999,10 +2109,12 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; struct aos_machine *machine = vaos->draw->vs.aos_machine; + if (0) debug_printf("%s %d\n", __FUNCTION__, count); + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; machine->constants = vaos->draw->vs.aligned_constants; machine->immediates = vaos->base.vs->immediates; - machine->attrib = vaos->attrib; + machine->buffer = vaos->buffer; vaos->gen_run_elts( machine, elts, @@ -2018,15 +2130,23 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; struct aos_machine *machine = vaos->draw->vs.aos_machine; + if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, + vaos->base.key.const_vbuffers); + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; machine->constants = vaos->draw->vs.aligned_constants; machine->immediates = vaos->base.vs->immediates; - machine->attrib = vaos->attrib; + machine->buffer = vaos->buffer; vaos->gen_run_linear( machine, start, count, output_buffer ); + + /* Sanity spot checks to make sure we didn't trash our constants */ + assert(machine->internal[IMM_ONES][0] == 1.0f); + assert(machine->internal[IMM_IDENTITY][0] == 0.0f); + assert(machine->internal[IMM_NEGS][0] == -1.0f); } @@ -2035,7 +2155,7 @@ static void vaos_destroy( struct draw_vs_varient *varient ) { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; - FREE( vaos->attrib ); + FREE( vaos->buffer ); x86_release_func( &vaos->func[0] ); x86_release_func( &vaos->func[1] ); @@ -2048,6 +2168,7 @@ static void vaos_destroy( struct draw_vs_varient *varient ) static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_varient_key *key ) { + unsigned i; struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); if (!vaos) @@ -2055,17 +2176,23 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, vaos->base.key = *key; vaos->base.vs = vs; - vaos->base.set_input = vaos_set_buffer; + vaos->base.set_buffer = vaos_set_buffer; vaos->base.destroy = vaos_destroy; vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; vaos->draw = vs->draw; - vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) ); - if (!vaos->attrib) + for (i = 0; i < key->nr_inputs; i++) + vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); + + vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); + if (!vaos->buffer) goto fail; + if (0) + debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); + #if 0 tgsi_dump(vs->state.tokens, 0); #endif @@ -2087,8 +2214,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, return &vaos->base; fail: - if (vaos && vaos->attrib) - FREE(vaos->attrib); + if (vaos && vaos->buffer) + FREE(vaos->buffer); if (vaos) x86_release_func( &vaos->func[0] ); @@ -2116,4 +2243,4 @@ struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, -#endif +#endif /* PIPE_ARCH_X86 */