*/
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
#include "pipe/p_shader_tokens.h"
-#include "tgsi/util/tgsi_parse.h"
-#include "tgsi/util/tgsi_util.h"
-#include "tgsi/exec/tgsi_exec.h"
-#include "tgsi/util/tgsi_dump.h"
+#include "pipe/p_debug.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
#include "draw_vs.h"
#include "draw_vs_aos.h"
#include "rtasm/rtasm_x86sse.h"
#ifdef PIPE_ARCH_X86
-
+#define DISASSEM 0
+#define FAST_MATH 1
+
+static const char *files[] = /* printable register-file names; indexed by cp->xmm[].file in spill()'s debug print */
+{
+ "NULL",
+ "CONST",
+ "IN",
+ "OUT",
+ "TEMP",
+ "SAMP",
+ "ADDR",
+ "IMM",
+ "INTERNAL", /* presumably the AOS_FILE_INTERNAL slot -- TODO confirm the index matches */
+};
static INLINE boolean eq( struct x86_reg a,
struct x86_reg b )
a.disp == b.disp);
}
+struct x86_reg aos_get_x86( struct aos_compilation *cp,
+ unsigned which_reg, /* quick hack */
+ unsigned value )
+{ /* Return the x86 register selected by which_reg (0 -> cp->temp_EBP,
+   * anything else -> cp->tmp_EAX).  If cp->x86_reg[which_reg] does not
+   * already record 'value', first emit a mov that loads the matching
+   * aos_machine field (immediates, constants or buffer) from
+   * cp->machine_EDX into that register, and record the new contents.
+   */
+ struct x86_reg reg;
+
+ if (which_reg == 0)
+ reg = cp->temp_EBP;
+ else
+ reg = cp->tmp_EAX;
+
+ if (cp->x86_reg[which_reg] != value) { /* tracked contents differ: (re)load */
+ unsigned offset;
+
+ switch (value) {
+ case X86_IMMEDIATES:
+ assert(which_reg == 0);
+ offset = Offset(struct aos_machine, immediates);
+ break;
+ case X86_CONSTANTS:
+ assert(which_reg == 1);
+ offset = Offset(struct aos_machine, constants);
+ break;
+ case X86_BUFFERS:
+ assert(which_reg == 0);
+ offset = Offset(struct aos_machine, buffer);
+ break;
+ default:
+ assert(0); /* unexpected 'value' */
+ offset = 0; /* keeps 'offset' initialised if the assert is compiled out */
+ }
+
+
+ x86_mov(cp->func, reg,
+ x86_make_disp(cp->machine_EDX, offset));
+
+ cp->x86_reg[which_reg] = value; /* remember what this register now holds */
+ }
+
+ return reg;
+}
+
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
unsigned file,
switch (file) {
case TGSI_FILE_INPUT:
+ assert(idx < MAX_INPUTS);
return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
case TGSI_FILE_OUTPUT:
return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
case TGSI_FILE_TEMPORARY:
+ assert(idx < MAX_TEMPS);
return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
- case TGSI_FILE_IMMEDIATE:
- return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx]));
-
- case TGSI_FILE_CONSTANT:
- return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx]));
-
case AOS_FILE_INTERNAL:
+ assert(idx < MAX_INTERNALS);
return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
+ case TGSI_FILE_IMMEDIATE:
+ assert(idx < MAX_IMMEDIATES); /* just a sanity check */
+ return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
+
+ case TGSI_FILE_CONSTANT:
+ assert(idx < MAX_CONSTANTS); /* just a sanity check */
+ return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
+
default:
ERROR(cp, "unknown reg file");
return x86_make_reg(0,0);
#define X87_CW_ROUND_MASK (3<<10)
#define X87_CW_INFINITY (1<<12)
-static void do_populate_lut( struct shine_tab *tab,
- float unclamped_exponent )
-{
- const float epsilon = 1.0F / 256.0F;
- float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon));
- unsigned i;
-
- tab->exponent = unclamped_exponent; /* for later comparison */
-
- tab->values[0] = 0;
- if (exponent == 0) {
- for (i = 1; i < 258; i++) {
- tab->values[i] = 1.0;
- }
- }
- else {
- for (i = 1; i < 258; i++) {
- tab->values[i] = powf((float)i * epsilon, exponent);
- }
- }
-}
-static void init_internals( struct aos_machine *machine )
-{
- unsigned i;
- float inv = 1.0f/255.0f;
- float f255 = 255.0f;
-
- ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f);
- *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff;
-
- ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f);
- ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f);
- ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f);
- ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv);
- ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255);
-
-
- machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP |
- X87_CW_EXCEPTION_DENORM_OP |
- X87_CW_EXCEPTION_ZERO_DIVIDE |
- X87_CW_EXCEPTION_OVERFLOW |
- X87_CW_EXCEPTION_UNDERFLOW |
- X87_CW_EXCEPTION_PRECISION |
- (1<<6) |
- X87_CW_ROUND_NEAREST |
- X87_CW_PRECISION_DOUBLE_EXT);
-
- assert(machine->fpu_rnd_nearest == 0x37f);
-
- machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP |
- X87_CW_EXCEPTION_DENORM_OP |
- X87_CW_EXCEPTION_ZERO_DIVIDE |
- X87_CW_EXCEPTION_OVERFLOW |
- X87_CW_EXCEPTION_UNDERFLOW |
- X87_CW_EXCEPTION_PRECISION |
- (1<<6) |
- X87_CW_ROUND_DOWN |
- X87_CW_PRECISION_DOUBLE_EXT);
-
- for (i = 0; i < MAX_SHINE_TAB; i++)
- do_populate_lut( &machine->shine_tab[i], 1.0f );
-}
static void spill( struct aos_compilation *cp, unsigned idx )
struct x86_reg oldval = get_reg_ptr(cp,
cp->xmm[idx].file,
cp->xmm[idx].idx);
-
+
+ if (0) debug_printf("\nspill %s[%d]",
+ files[cp->xmm[idx].file],
+ cp->xmm[idx].idx);
+
assert(cp->xmm[idx].dirty);
sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
cp->xmm[idx].dirty = 0;
}
}
-static boolean is_xmm_tmp( struct aos_compilation *cp,
- struct x86_reg reg )
+
+void aos_spill_all( struct aos_compilation *cp )
{
- return (reg.file == file_XMM &&
- cp->xmm[reg.idx].file == TGSI_FILE_NULL);
+ unsigned i;
+
+ for (i = 0; i < 8; i++) { /* every tracked xmm slot: write back if dirty, then release it */
+ if (cp->xmm[i].dirty)
+ spill(cp, i);
+ aos_release_xmm_reg(cp, i);
+ }
}
-static struct x86_reg get_xmm_clone( struct aos_compilation *cp,
- struct x86_reg reg )
+
+static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
+ struct x86_reg reg )
{
- if (!is_xmm_tmp(cp, reg)) {
+ if (reg.file != file_XMM ||
+ cp->xmm[reg.idx].file != TGSI_FILE_NULL)
+ { /* not an xmm register tagged TGSI_FILE_NULL (i.e. not a plain temporary): copy it into a fresh xmm register so the result may be overwritten */
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, reg);
reg = tmp;
}
+ cp->xmm[reg.idx].last_used = cp->insn_counter; /* stamp with the current instruction counter */
return reg;
}
+static struct x86_reg get_xmm( struct aos_compilation *cp,
+ struct x86_reg reg )
+{ /* Return 'reg' as an xmm register.  Unlike get_xmm_writable(), an
+   * xmm register that currently holds a shader reg is returned as-is
+   * rather than copied.
+   */
+ if (reg.file != file_XMM)
+ { /* operand is not in an xmm register: load it into a freshly allocated one */
+ struct x86_reg tmp = aos_get_xmm_reg(cp);
+ sse_movaps(cp->func, tmp, reg);
+ reg = tmp;
+ }
+ cp->xmm[reg.idx].last_used = cp->insn_counter; /* stamp with the current instruction counter */
+ return reg;
+}
+
+
+/* Allocate an empty xmm register, either as a temporary or later to
+ * "adopt" as a shader reg.
+ */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
unsigned i;
cp->xmm[oldest].file = TGSI_FILE_NULL;
cp->xmm[oldest].idx = 0;
+ cp->xmm[oldest].dirty = 0;
cp->xmm[oldest].last_used = cp->insn_counter;
return x86_make_reg(file_XMM, oldest);
}
cp->xmm[idx].last_used = 0;
}
-static void invalidate_xmm( struct aos_compilation *cp,
- unsigned file, unsigned idx )
-{
- unsigned i;
- /* Invalidate any old copy of this register in XMM0-7.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
-
- if (cp->xmm[i].dirty)
- spill(cp, i);
-
- aos_release_xmm_reg(cp, i);
- break;
- }
- }
-
- for (; i < 8; i++) {
- if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
- assert(0);
- }
- }
-}
-
+
+/* Mark an xmm reg as holding the current copy of a shader reg.
+ */
void aos_adopt_xmm_reg( struct aos_compilation *cp,
struct x86_reg reg,
unsigned file,
return;
}
+
+ /* If any xmm reg thinks it holds this shader reg, break the
+ * illusion.
+ */
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx) {
+ cp->xmm[i].idx == idx)
+ {
+ /* If an xmm reg is already holding this shader reg, take into account its
+ * dirty flag...
+ */
+ dirty |= cp->xmm[i].dirty;
aos_release_xmm_reg(cp, i);
}
}
}
-
+/* Return a pointer to the in-memory copy of the reg, making sure it is
+ * up to date: a dirty xmm copy is spilled first.  Unlike get_dst_ptr(),
+ * the xmm copy is not released here.
+ */
static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
- invalidate_xmm( cp, file, idx );
+ unsigned i;
+
+ /* Ensure the in-memory copy of this reg is up-to-date
+ */
+ for (i = 0; i < 8; i++) {
+ if (cp->xmm[i].file == file &&
+ cp->xmm[i].idx == idx &&
+ cp->xmm[i].dirty) {
+ spill(cp, i);
+ }
+ }
+
return get_reg_ptr( cp, file, idx );
}
static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
const struct tgsi_full_dst_register *dst )
{
- return aos_get_shader_reg_ptr( cp, dst->DstRegister.File, dst->DstRegister.Index );
+ unsigned file = dst->DstRegister.File;
+ unsigned idx = dst->DstRegister.Index;
+ unsigned i;
+
+
+ /* Ensure in-memory copy of this reg is up-to-date and invalidate
+ * any xmm copies.  Every matching slot is visited (there is no early
+ * break).  Presumably the copies are dropped because the caller writes
+ * through the returned pointer -- confirm against the callers.
+ */
+ for (i = 0; i < 8; i++) {
+ if (cp->xmm[i].file == file &&
+ cp->xmm[i].idx == idx)
+ {
+ if (cp->xmm[i].dirty)
+ spill(cp, i);
+
+ aos_release_xmm_reg(cp, i);
+ }
+ }
+
+ return get_reg_ptr( cp, file, idx );
}
unsigned file,
unsigned idx )
{
- struct x86_reg reg = aos_get_shader_reg( cp, file, idx );
-
- if (reg.file != file_XMM) {
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_movaps(cp->func, tmp, reg);
- aos_adopt_xmm_reg( cp, tmp, file, idx, FALSE );
- reg = tmp;
- }
+ struct x86_reg reg = get_xmm( cp,
+ aos_get_shader_reg( cp, file, idx ) );
+ aos_adopt_xmm_reg( cp,
+ reg,
+ file,
+ idx,
+ FALSE );
+
return reg;
}
src->SrcRegister.File,
src->SrcRegister.Index);
unsigned i;
- unsigned swz = 0;
+ ubyte swz = 0;
unsigned negs = 0;
unsigned abs = 0;
if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
struct x86_reg dst = aos_get_xmm_reg(cp);
- if (swz != SSE_SWIZZLE_NOOP) {
+ if (swz != SSE_SWIZZLE_NOOP)
emit_pshufd(cp, dst, arg0, swz);
- arg0 = dst;
- }
+ else
+ sse_movaps(cp->func, dst, arg0);
if (negs && negs != 0xf) {
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
(negs & 2) ? 1 : 0,
(negs & 4) ? 1 : 0,
(negs & 8) ? 1 : 0));
- sse_mulps(cp->func, dst, arg0);
+ sse_mulps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
- arg0 = dst;
}
else if (negs) {
struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
sse_mulps(cp->func, dst, imm_negs);
- arg0 = dst;
}
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_movaps(cp->func, tmp, arg0);
+ sse_movaps(cp->func, tmp, dst);
sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, dst, arg0);
+ sse_maxps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
- arg0 = dst;
}
+
+ return dst;
}
return arg0;
case TGSI_WRITEMASK_XYZW:
aos_adopt_xmm_reg(cp,
- get_xmm_clone(cp, result),
+ get_xmm_writable(cp, result),
reg->DstRegister.File,
reg->DstRegister.Index,
TRUE);
switch (reg->DstRegister.WriteMask) {
case TGSI_WRITEMASK_X:
- sse_movss(cp->func, dst, get_xmm_clone(cp, result));
+ sse_movss(cp->func, dst, get_xmm(cp, result));
break;
- case TGSI_WRITEMASK_XY:
- sse_shufps(cp->func, dst, get_xmm_clone(cp, result), SHUF(X, Y, Z, W));
+ case TGSI_WRITEMASK_ZW:
+ sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
break;
- case TGSI_WRITEMASK_ZW:
- result = get_xmm_clone(cp, result);
+ case TGSI_WRITEMASK_XY:
+ result = get_xmm_writable(cp, result);
sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
dst = result;
break;
case TGSI_WRITEMASK_YZW:
+ result = get_xmm_writable(cp, result);
sse_movss(cp->func, result, dst);
dst = result;
break;
}
+static void inject_scalar( struct aos_compilation *cp,
+ struct x86_reg dst,
+ struct x86_reg result,
+ ubyte swizzle )
+{ /* Emit three instructions: shuffle dst by 'swizzle', overwrite dst's
+   * low float with result's low float (movss), then shuffle dst by
+   * 'swizzle' again.  The callers in store_scalar_dest() pass swizzles
+   * that exchange X with one other component, so the second shuffle
+   * puts the remaining components back where they started.
+   */
+ sse_shufps(cp->func, dst, dst, swizzle);
+ sse_movss(cp->func, dst, result);
+ sse_shufps(cp->func, dst, dst, swizzle);
+}
+
+
+static void store_scalar_dest( struct aos_compilation *cp,
+ const struct tgsi_full_dst_register *reg,
+ struct x86_reg result )
+{ /* Store a scalar result (held in the low float of 'result') to the
+   * destination register 'reg'.
+   *
+   * - Writemask with more than one component: broadcast the low float
+   *   to all four lanes and hand over to store_dest().
+   * - Writemask with exactly one component (or zero): fetch the
+   *   destination into an xmm register, merge the scalar into the
+   *   selected lane, and adopt that xmm register as the dirty copy of
+   *   the destination.
+   */
+ unsigned writemask = reg->DstRegister.WriteMask;
+ struct x86_reg dst;
+
+ if (writemask != TGSI_WRITEMASK_X &&
+ writemask != TGSI_WRITEMASK_Y &&
+ writemask != TGSI_WRITEMASK_Z &&
+ writemask != TGSI_WRITEMASK_W &&
+ writemask != 0)
+ {
+ result = get_xmm_writable(cp, result); /* already true, right? */
+ sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); /* replicate the scalar into every lane */
+ store_dest(cp, reg, result);
+ return;
+ }
+
+ result = get_xmm(cp, result);
+ dst = aos_get_shader_reg_xmm(cp,
+ reg->DstRegister.File,
+ reg->DstRegister.Index);
+
+
+
+ switch (reg->DstRegister.WriteMask) { /* same value as the local 'writemask' */
+ case TGSI_WRITEMASK_X:
+ sse_movss(cp->func, dst, result);
+ break;
+
+ case TGSI_WRITEMASK_Y:
+ inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
+ break;
+
+ case TGSI_WRITEMASK_Z:
+ inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
+ break;
+
+ case TGSI_WRITEMASK_W:
+ inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
+ break;
+
+ default: /* only reachable with a zero writemask: dst is left unmodified */
+ break;
+ }
+
+ aos_adopt_xmm_reg(cp,
+ dst,
+ reg->DstRegister.File,
+ reg->DstRegister.Index,
+ TRUE); /* TRUE is the dirty argument (cf. FALSE for a plain load) */
+}
+
+
static void x87_fst_or_nop( struct x86_function *func,
unsigned writemask,
}
}
-
+#if 0
static void x87_emit_ex2( struct aos_compilation *cp )
{
struct x86_reg st0 = x86_make_reg(file_x87, 0);
assert( stack == cp->func->x87_stack);
}
+#endif
+#if 0
static void PIPE_CDECL print_reg( const char *msg,
const float *reg )
{
debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
+#endif
+#if 0
static void emit_print( struct aos_compilation *cp,
const char *message, /* must point to a static string! */
unsigned file,
unsigned idx )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- struct x86_reg arg = get_reg_ptr( cp, file, idx );
+ struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
unsigned i;
/* There shouldn't be anything on the x87 stack. Can add this
/* Done...
*/
}
+#endif
/**
* The traditional instructions. All operate on internal registers
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_mulps(cp->func, dst, neg);
- sse_maxps(cp->func, dst, arg0);
+ sse_movaps(cp->func, tmp, arg0);
+ sse_mulps(cp->func, tmp, neg);
+ sse_maxps(cp->func, tmp, arg0);
- store_dest(cp, &op->FullDstRegisters[0], dst);
+ store_dest(cp, &op->FullDstRegisters[0], tmp);
return TRUE;
}
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_addps(cp->func, dst, arg1);
return TRUE;
}
-
/* The dotproduct instructions don't really do that well in sse:
+ * XXX: produces wrong results -- disabled.
*/
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
-
/* Now the hard bit: sum the first 3 values:
*/
sse_movhlps(cp->func, tmp, dst);
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
- if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1)
- sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X));
-
aos_release_xmm_reg(cp, tmp.idx);
- store_dest(cp, &op->FullDstRegisters[0], dst);
+ store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
return TRUE;
}
-
-
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
- if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1)
- sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X));
-
aos_release_xmm_reg(cp, tmp.idx);
- store_dest(cp, &op->FullDstRegisters[0], dst);
+ store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
return TRUE;
}
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
sse_addss(cp->func, dst, tmp);
- if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1)
- sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X));
-
aos_release_xmm_reg(cp, tmp.idx);
- store_dest(cp, &op->FullDstRegisters[0], dst);
+ store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
return TRUE;
}
return TRUE;
}
-
+#if 0
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
return TRUE;
}
+#endif
-static boolean emit_EXP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
- struct x86_reg st0 = x86_make_reg(file_x87, 0);
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- struct x86_reg st3 = x86_make_reg(file_x87, 3);
- unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
-
- /* CAUTION: dst may alias arg0!
- */
- x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */
- x87_fld(cp->func, st0); /* arg arg */
-
- /* by default, fpu is setup to round-to-nearest. We want to
- * change this now, and track the state through to the end of the
- * generated function so that it isn't repeated unnecessarily.
- * Alternately, could subtract .5 to get round to -inf behaviour.
- */
- set_fpu_round_neg_inf( cp );
- x87_fprndint( cp->func ); /* flr(a) a */
- x87_fld(cp->func, st0); /* flr(a) flr(a) a */
- x87_fld1(cp->func); /* 1 floor(a) floor(a) a */
- x87_fst_or_nop(cp->func, writemask, 3, dst); /* stack unchanged */
-
- x87_fscale(cp->func); /* 2^floor(a) floor(a) a */
- x87_fst(cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/
-
- x87_fstp_or_pop(cp->func, writemask, 0, dst); /* flr(a) a 2^flr(a) */
-
- x87_fsubp(cp->func, st1); /* frac(a) 2^flr(a) */
-
- x87_fst_or_nop(cp->func, writemask, 1, dst); /* frac(a) 2^flr(a) */
-
- x87_f2xm1(cp->func); /* (2^frac(a))-1 2^flr(a)*/
- x87_fld1(cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/
- x87_faddp(cp->func, st1); /* 2^frac(a) 2^flr(a) */
- x87_fmulp(cp->func, st1); /* 2^a */
-
- x87_fstp_or_pop(cp->func, writemask, 2, dst);
-
-/* dst[0] = 2^floor(tmp); */
-/* dst[1] = frac(tmp); */
-/* dst[2] = 2^floor(tmp) * 2^frac(tmp); */
-/* dst[3] = 1.0F; */
- return TRUE;
-}
-
-static boolean emit_LOG( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
- struct x86_reg st0 = x86_make_reg(file_x87, 0);
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- struct x86_reg st2 = x86_make_reg(file_x87, 2);
- unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
-
- /* CAUTION: dst may alias arg0!
- */
- x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */
- x87_fabs(cp->func); /* |arg0.x| */
- x87_fxtract(cp->func); /* mantissa(arg0.x), exponent(arg0.x) */
- x87_fst(cp->func, st2); /* mantissa, exponent, mantissa */
- x87_fld1(cp->func); /* 1, mantissa, exponent, mantissa */
- x87_fyl2x(cp->func); /* log2(mantissa), exponent, mantissa */
- x87_fadd(cp->func, st0, st1); /* e+l2(m), e, m */
-
- x87_fstp_or_pop(cp->func, writemask, 2, dst); /* e, m */
-
- x87_fld1(cp->func); /* 1, e, m */
- x87_fsub(cp->func, st1, st0); /* 1, e-1, m */
-
- x87_fstp_or_pop(cp->func, writemask, 3, dst); /* e-1,m */
- x87_fstp_or_pop(cp->func, writemask, 0, dst); /* m */
-
- x87_fadd(cp->func, st0, st0); /* 2m */
-
- x87_fstp_or_pop( cp->func, writemask, 1, dst );
-
- return TRUE;
-}
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
return TRUE;
}
-static PIPE_CDECL void do_lit( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count )
-{
- if (in[0] > 0)
- {
- if (in[1] <= 0.0)
- {
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = 1.0;
- result[3] = 1.0F;
- }
- else
- {
- const float epsilon = 1.0F / 256.0F;
- float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = powf(in[1], exponent);
- result[3] = 1.0;
- }
- }
- else
- {
- result[0] = 1.0F;
- result[1] = 0.0;
- result[2] = 0.0;
- result[3] = 1.0F;
- }
-}
-
-
-static PIPE_CDECL void do_lit_lut( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count )
-{
- if (in[0] > 0)
- {
- if (in[1] <= 0.0)
- {
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = 1.0;
- result[3] = 1.0F;
- return;
- }
-
- if (machine->lit_info[count].shine_tab->exponent != in[3]) {
- machine->lit_info[count].func = do_lit;
- goto no_luck;
- }
-
- if (in[1] <= 1.0)
- {
- const float *tab = machine->lit_info[count].shine_tab->values;
- float f = in[1] * 256;
- int k = (int)f;
- float frac = f - (float)k;
-
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = tab[k] + frac*(tab[k+1]-tab[k]);
- result[3] = 1.0;
- return;
- }
-
- no_luck:
- {
- const float epsilon = 1.0F / 256.0F;
- float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = powf(in[1], exponent);
- result[3] = 1.0;
- }
- }
- else
- {
- result[0] = 1.0F;
- result[1] = 0.0;
- result[2] = 0.0;
- result[3] = 1.0F;
- }
-}
-
-
-
-static void PIPE_CDECL populate_lut( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count )
-{
- unsigned i, tab;
-
- /* Search for an existing table for this value. Note that without
- * static analysis we don't really know if in[3] will be constant,
- * but it usually is...
- */
- for (tab = 0; tab < 4; tab++) {
- if (machine->shine_tab[tab].exponent == in[3]) {
- goto found;
- }
- }
-
- for (tab = 0, i = 1; i < 4; i++) {
- if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used)
- tab = i;
- }
-
- if (machine->shine_tab[tab].last_used == machine->now) {
- /* No unused tables (this is not a ffvertex program...). Just
- * call pow each time:
- */
- machine->lit_info[count].func = do_lit;
- machine->lit_info[count].func( machine, result, in, count );
- return;
- }
- else {
- do_populate_lut( &machine->shine_tab[tab], in[3] );
- }
-
- found:
- machine->shine_tab[tab].last_used = machine->now;
- machine->lit_info[count].shine_tab = &machine->shine_tab[tab];
- machine->lit_info[count].func = do_lit_lut;
- machine->lit_info[count].func( machine, result, in, count );
-}
Offset(struct lit_info, func)));
}
else {
- x86_mov_reg_imm( cp->func, ecx, (int)do_lit );
+ x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
}
x86_call( cp->func, ecx );
if (writemask != TGSI_WRITEMASK_XYZW) {
store_dest( cp,
&op->FullDstRegisters[0],
- get_xmm_clone( cp, result ) );
+ get_xmm_writable( cp, result ) );
}
return TRUE;
}
-
+#if 0
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
return TRUE;
}
+#endif
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_maxps(cp->func, dst, arg1);
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_minps(cp->func, dst, arg1);
static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
/* potentially nothing to do */
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* If we can't clobber old contents of arg0, get a temporary & copy
* it there, then clobber it...
*/
- arg0 = get_xmm_clone(cp, arg0);
+ arg0 = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, arg0, arg1);
sse_addps(cp->func, arg0, arg2);
}
+
+/* A cdecl wrapper that takes and returns floats; its address is embedded
+ * in generated code by emit_POW().  Evaluates util_fast_pow() when
+ * FAST_MATH is non-zero and powf() otherwise.
+ */
+static float PIPE_CDECL _powerf( float x, float y )
+{
+#if FAST_MATH
+ return util_fast_pow(x, y);
+#else
+ return powf( x, y );
+#endif
+}
+
+#if FAST_MATH
+static float PIPE_CDECL _exp2(float x)
+{ /* cdecl wrapper around util_fast_exp2(); its address is embedded in generated code by emit_EXPBASE2() */
+ return util_fast_exp2(x);
+}
+#endif /* FAST_MATH */
+
+
+/* Really not sufficient -- need to check for conditions that could
+ * generate inf/nan values, which will slow things down hugely.
+ *
+ * The live (#else) path emits a cdecl call to _powerf(): all xmm
+ * registers are spilled and released, the two source operands are
+ * written to the stack as float arguments via the x87 unit, and the
+ * return value is picked up from the x87 stack and stored to the
+ * destination.  The older inline x87 sequence is kept under #if 0.
+ */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
+#if 0
x87_fld_src(cp, &op->FullSrcRegisters[1], 0); /* a1.x */
x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0.x a1.x */
x87_fyl2x(cp->func); /* a1*log2(a0) */
x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */
x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
+#else
+ uint i;
+
+ /* For absolute correctness, need to spill/invalidate all XMM regs
+ * too.  (Same loop body as aos_spill_all().)
+ */
+ for (i = 0; i < 8; i++) {
+ if (cp->xmm[i].dirty)
+ spill(cp, i);
+ aos_release_xmm_reg(cp, i);
+ }
+
+ /* Push caller-save (ie scratch) regs.
+ */
+ x86_cdecl_caller_push_regs( cp->func );
+
+ x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) ); /* reserve 8 bytes for the two float arguments */
+
+ x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
+ x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) ); /* [esp+4]: second _powerf argument, from source 1 */
+ x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
+ x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); /* [esp+0]: first _powerf argument, from source 0 */
+
+ /* tmp_EAX has been pushed & will be restored below.
+ * NOTE(review): if x87_fld_src() above reached aos_get_x86(), the
+ * register it loaded is clobbered/restored by this sequence while
+ * cp->x86_reg[] still records the new contents -- verify that the
+ * tracking cannot go stale here.
+ */
+ x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
+ x86_call( cp->func, cp->tmp_EAX );
+
+ x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) ); /* release the argument slots */
+
+ x86_cdecl_caller_pop_regs( cp->func );
+
+ /* Note retval on x87 stack:
+ */
+ cp->func->x87_stack++;
+
+ x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
+#endif
+ return TRUE;
+}
+
+
+#if FAST_MATH
+static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{ /* Emit a cdecl call to _exp2() with source 0 as its single float
+   * argument, then store the x87 return value to the destination.
+   * Mirrors the call sequence in emit_POW() with one argument instead
+   * of two.
+   */
+ uint i;
+
+ /* For absolute correctness, need to spill/invalidate all XMM regs
+ * too.  (Same loop body as aos_spill_all().)
+ */
+ for (i = 0; i < 8; i++) {
+ if (cp->xmm[i].dirty)
+ spill(cp, i);
+ aos_release_xmm_reg(cp, i);
+ }
+
+ /* Push caller-save (ie scratch) regs.
+ */
+ x86_cdecl_caller_push_regs( cp->func );
+
+ x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); /* reserve 4 bytes for the float argument */
+
+ x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
+ x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); /* [esp+0]: the _exp2 argument */
+
+ /* tmp_EAX has been pushed & will be restored below */
+ x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
+ x86_call( cp->func, cp->tmp_EAX );
+
+ x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); /* release the argument slot */
+
+ x86_cdecl_caller_pop_regs( cp->func );
+
+ /* Note retval on x87 stack:
+ */
+ cp->func->x87_stack++;
+
+ x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
+
return TRUE;
}
+#endif /* FAST_MATH */
static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
sse_divss(cp->func, dst, arg0);
}
- if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1)
- sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X));
-
- store_dest(cp, &op->FullDstRegisters[0], dst);
+ store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
return TRUE;
}
+
+/* Although rsqrtps() and rcpps() are low precision on some/all SSE
+ * implementations, it is possible to improve their precision at
+ * fairly low cost, using a Newton-Raphson step, as below:
+ *
+ * reciprocal:
+ * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+ * reciprocal square root:
+ * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+ * or:
+ * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
+ *
+ * emit_RSQ() below emits the last form, on the low float only.
+ *
+ * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
- struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
- struct x86_reg dst = aos_get_xmm_reg(cp);
- sse_rsqrtss(cp->func, dst, arg0);
-
- /* Extend precision here...
- */
-
- if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1)
- sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X));
+ if (0) { /* disabled: the unrefined rsqrtss estimate */
+ struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+ struct x86_reg r = aos_get_xmm_reg(cp);
+ sse_rsqrtss(cp->func, r, arg0);
+ store_scalar_dest(cp, &op->FullDstRegisters[0], r);
+ return TRUE;
+ }
+ else {
+ struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+ struct x86_reg r = aos_get_xmm_reg(cp);
- store_dest(cp, &op->FullDstRegisters[0], dst);
- return TRUE;
+ struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); /* assumes internal[IMM_RSQ] starts with { -0.5, 1.5 } -- set up outside this view, TODO confirm */
+ struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); /* the float 4 bytes after neg_half */
+ struct x86_reg src = get_xmm_writable( cp, arg0 ); /* private copy: it is overwritten below */
+
+ sse_rsqrtss( cp->func, r, src ); /* rsqrtss(a) */
+ sse_mulss( cp->func, src, neg_half ); /* -.5 * a */
+ sse_mulss( cp->func, src, r ); /* -.5 * a * r */
+ sse_mulss( cp->func, src, r ); /* -.5 * a * r * r */
+ sse_addss( cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
+ sse_mulss( cp->func, r, src ); /* r * (1.5 - .5 * a * r * r) */
+
+ store_scalar_dest(cp, &op->FullDstRegisters[0], r);
+ return TRUE;
+ }
}
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
sse_andps(cp->func, dst, ones);
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_cmpps(cp->func, dst, arg1, cc_LessThan);
sse_andps(cp->func, dst, ones);
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
- struct x86_reg dst = get_xmm_clone(cp, arg0);
+ struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_subps(cp->func, dst, arg1);
return TRUE;
}
+static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{ /* Truncate each component by converting float -> int32 with
+   * truncation (cvttps2dq) and back to float (cvtdq2ps).
+   * NOTE(review): components outside the int32 range cannot survive
+   * this round trip -- confirm that is acceptable for TRUNC.
+   */
+ struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+ struct x86_reg tmp0 = aos_get_xmm_reg(cp);
+
+ sse2_cvttps2dq(cp->func, tmp0, arg0);
+ sse2_cvtdq2ps(cp->func, tmp0, tmp0);
+
+ store_dest(cp, &op->FullDstRegisters[0], tmp0);
+ return TRUE;
+}
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
- struct x86_reg dst = aos_get_xmm_reg(cp);
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
struct x86_reg tmp1 = aos_get_xmm_reg(cp);
- /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way
- * to invalidate registers. This will come with better analysis
- * (liveness analysis) of the incoming program.
- */
- emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W));
- emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W));
- sse_mulps(cp->func, dst, tmp1);
- emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W));
emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
- sse_mulps(cp->func, tmp0, tmp1);
- sse_subps(cp->func, dst, tmp0);
+ sse_mulps(cp->func, tmp1, arg0); /* tmp1 = arg0 * shuffled arg1 */
+ emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
+ sse_mulps(cp->func, tmp0, arg1); /* tmp0 = arg1 * shuffled arg0 */
+ sse_subps(cp->func, tmp1, tmp0); /* cross product with lanes rotated -- assuming SHUF() lists source components in destination order, lane 0 holds the dst[2] term below */
+ sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); /* rotate the lanes into dst[0], dst[1], dst[2] order */
+/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
-/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
/* dst[3] is undef */
aos_release_xmm_reg(cp, tmp0.idx);
- aos_release_xmm_reg(cp, tmp1.idx);
- store_dest(cp, &op->FullDstRegisters[0], dst);
+ store_dest(cp, &op->FullDstRegisters[0], tmp1);
return TRUE;
}
return emit_RSQ(cp, inst);
case TGSI_OPCODE_EXP:
- return emit_EXP(cp, inst);
+ /*return emit_EXP(cp, inst);*/
+ return FALSE;
case TGSI_OPCODE_LOG:
- return emit_LOG(cp, inst);
+ /*return emit_LOG(cp, inst);*/
+ return FALSE;
case TGSI_OPCODE_MUL:
return emit_MUL(cp, inst);
return emit_RND(cp, inst);
case TGSI_OPCODE_EXPBASE2:
+#if FAST_MATH
+ return emit_EXPBASE2(cp, inst);
+#elif 0
+ /* this seems to fail for "larger" exponents.
+ * See glean tvertProg1's EX2 test.
+ */
return emit_EX2(cp, inst);
+#else
+ return FALSE;
+#endif
case TGSI_OPCODE_LOGBASE2:
return emit_LG2(cp, inst);
case TGSI_OPCODE_SIN:
return emit_SIN(cp, inst);
+ case TGSI_OPCODE_TRUNC:
+ return emit_TRUNC(cp, inst);
+
case TGSI_OPCODE_END:
return TRUE;
{
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
TGSI_FILE_OUTPUT,
- 0);
+ cp->vaos->draw->vs.position_output );
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, scale));
aos_adopt_xmm_reg( cp,
pos,
TGSI_FILE_OUTPUT,
- 0,
+ cp->vaos->draw->vs.position_output,
TRUE );
return TRUE;
}
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
TGSI_FILE_OUTPUT,
- 0);
+ cp->vaos->draw->vs.position_output);
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, scale));
aos_adopt_xmm_reg( cp,
pos,
TGSI_FILE_OUTPUT,
- 0,
+ cp->vaos->draw->vs.position_output,
TRUE );
return TRUE;
}
+#if 0
static boolean note_immediate( struct aos_compilation *cp,
struct tgsi_full_immediate *imm )
{
return TRUE;
}
+#endif
}
-#define ARG_VARIENT 1
+#define ARG_MACHINE 1 /* argument 1 is fetched with x86_fn_arg() straight into cp.machine_EDX */
#define ARG_START_ELTS 2
#define ARG_COUNT 3
#define ARG_OUTBUF 4
struct aos_compilation cp;
unsigned fixup, label;
+ util_init_math();
+
tgsi_parse_init( &parse, varient->base.vs->state.tokens );
memset(&cp, 0, sizeof(cp));
cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
+ cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
+ cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
x86_init_func(cp.func);
x86_push(cp.func, cp.idx_EBX);
x86_push(cp.func, cp.count_ESI);
+ x86_push(cp.func, cp.temp_EBP);
/* Load arguments into regs:
*/
- x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_VARIENT));
+ x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
fixup = x86_jcc_forward(cp.func, cc_E);
- /* Dig out the machine pointer from inside the varient arg
- */
- x86_mov(cp.func, cp.machine_EDX,
- x86_make_disp(cp.machine_EDX,
- Offset( struct draw_vs_varient_aos_sse, machine )));
save_fpu_state( &cp );
set_fpu_round_nearest( &cp );
+ aos_init_inputs( &cp, linear );
+
+ cp.x86_reg[0] = 0;
+ cp.x86_reg[1] = 0;
+
/* Note address for loop jump
*/
label = x86_get_label(cp.func);
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
+#if 0
if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
goto fail;
+#endif
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
+ if (DISASSEM)
+ tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
+
if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
goto fail;
break;
x87_assert_stack_empty(cp.func);
cp.insn_counter++;
- debug_printf("\n");
+
+ if (DISASSEM)
+ debug_printf("\n");
+ }
+
+
+ {
+ unsigned i;
+ for (i = 0; i < 8; i++) {
+ if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
+ cp.xmm[i].file = TGSI_FILE_NULL;
+ cp.xmm[i].dirty = 0;
+ }
+ }
}
if (cp.error)
goto fail;
- if (cp.vaos->base.key.viewport) {
- if (0)
- emit_viewport(&cp);
- else
- emit_rhw_viewport(&cp);
+ if (cp.vaos->base.key.clip) {
+ /* not really handling clipping, just do the rhw so we can
+ * see the results...
+ */
+ emit_rhw_viewport(&cp);
+ }
+ else if (cp.vaos->base.key.viewport) {
+ emit_viewport(&cp);
}
/* Emit output... TODO: do this eagerly after the last write to a
/* Incr index
*/
- if (linear) {
- x86_inc(cp.func, cp.idx_EBX);
- }
- else {
- x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
- }
-
+ aos_incr_inputs( &cp, linear );
}
/* decr count, loop if not zero
*/
x86_dec(cp.func, cp.count_ESI);
-/* x86_test(cp.func, cp.count_ESI, cp.count_ESI); */
x86_jcc(cp.func, cc_NZ, label);
restore_fpu_state(&cp);
if (cp.func->need_emms)
mmx_emms(cp.func);
+ x86_pop(cp.func, cp.temp_EBP);
x86_pop(cp.func, cp.count_ESI);
x86_pop(cp.func, cp.idx_EBX);
unsigned stride )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
- unsigned i;
- for (i = 0; i < vaos->base.vs->info.num_inputs; i++) {
- if (vaos->base.key.element[i].in.buffer == buf) {
- vaos->machine->attrib[i].input_ptr = ((char *)ptr +
- vaos->base.key.element[i].in.offset);
- vaos->machine->attrib[i].input_stride = stride;
- }
+ if (buf < vaos->nr_vb) {
+ vaos->buffer[buf].base_ptr = (char *)ptr;
+ vaos->buffer[buf].stride = stride;
}
+
+ if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
}
-static void vaos_destroy( struct draw_vs_varient *varient )
+
+static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
+ const unsigned *elts,
+ unsigned count,
+ void *output_buffer ) /* Refresh the aos_machine fields set below, then call the generated elts function. */
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+ struct aos_machine *machine = vaos->draw->vs.aos_machine; /* machine state is read from the draw context */
- if (vaos->machine)
- align_free( vaos->machine );
+ if (0) debug_printf("%s %d\n", __FUNCTION__, count); /* disabled trace */
- x86_release_func( &vaos->func[0] );
- x86_release_func( &vaos->func[1] );
+ machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; /* rasterizer point size into the IMM_PSIZE internal slot */
+ machine->constants = vaos->draw->vs.aligned_constants; /* constants pointer taken from the draw context */
+ machine->immediates = vaos->base.vs->immediates; /* immediates pointer taken from the shader */
+ machine->buffer = vaos->buffer; /* this varient's vertex-buffer table */
- FREE(vaos);
-}
-
-static void vaos_run_elts( struct draw_vs_varient *varient,
- const unsigned *elts,
- unsigned count,
- void *output_buffer )
-{
- struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
-
- vaos->gen_run_elts( varient,
+ vaos->gen_run_elts( machine, /* generated code receives the machine, not the varient */
elts,
count,
output_buffer );
}
-static void vaos_run_linear( struct draw_vs_varient *varient,
- unsigned start,
- unsigned count,
- void *output_buffer )
+static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
+ unsigned start,
+ unsigned count,
+ void *output_buffer ) /* Refresh the aos_machine fields set below, call the generated linear function, then spot-check three internal constants. */
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+ struct aos_machine *machine = vaos->draw->vs.aos_machine; /* machine state is read from the draw context */
+
+ if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
+ vaos->base.key.const_vbuffers); /* disabled trace */
- vaos->gen_run_linear( varient,
+ machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; /* rasterizer point size into the IMM_PSIZE internal slot */
+ machine->constants = vaos->draw->vs.aligned_constants; /* constants pointer taken from the draw context */
+ machine->immediates = vaos->base.vs->immediates; /* immediates pointer taken from the shader */
+ machine->buffer = vaos->buffer; /* this varient's vertex-buffer table */
+
+ vaos->gen_run_linear( machine, /* generated code receives the machine, not the varient */
start,
count,
output_buffer );
+
+ /* Sanity spot checks to make sure we didn't trash our constants */
+ assert(machine->internal[IMM_ONES][0] == 1.0f);
+ assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
+ assert(machine->internal[IMM_NEGS][0] == -1.0f);
}
-static void vaos_set_constants( struct draw_vs_varient *varient,
- const float (*constants)[4] )
+
+static void vaos_destroy( struct draw_vs_varient *varient ) /* Free the buffer table, release both generated functions, then free the varient itself. */
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
- memcpy(vaos->machine->constant,
- constants,
- (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float));
+ FREE( vaos->buffer ); /* table allocated with MALLOC in varient_aos_sse */
-#if 0
- unsigned i;
- for (i =0; i < vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1; i++)
- debug_printf("state %d: %f %f %f %f\n",
- i,
- constants[i][0],
- constants[i][1],
- constants[i][2],
- constants[i][3]);
-#endif
-
- {
- unsigned i;
- for (i = 0; i < MAX_LIT_INFO; i++) {
- vaos->machine->lit_info[i].func = populate_lut;
- vaos->machine->now++;
- }
- }
-}
-
-
-static void vaos_set_viewport( struct draw_vs_varient *varient,
- const struct pipe_viewport_state *viewport )
-{
- struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+ x86_release_func( &vaos->func[0] ); /* func[0] backs gen_run_linear */
+ x86_release_func( &vaos->func[1] ); /* func[1] backs gen_run_elts */
- memcpy(vaos->machine->scale, viewport->scale, 4 * sizeof(float));
- memcpy(vaos->machine->translate, viewport->translate, 4 * sizeof(float));
+ FREE(vaos); /* the aos_machine is not freed here; run_* read it from vaos->draw */
}
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
const struct draw_vs_varient_key *key )
{
+ unsigned i;
struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
- if (key->clip)
- return NULL;
-
if (!vaos)
goto fail;
vaos->base.key = *key;
vaos->base.vs = vs;
- vaos->base.set_input = vaos_set_buffer;
- vaos->base.set_constants = vaos_set_constants;
- vaos->base.set_viewport = vaos_set_viewport;
+ vaos->base.set_buffer = vaos_set_buffer;
vaos->base.destroy = vaos_destroy;
vaos->base.run_linear = vaos_run_linear;
vaos->base.run_elts = vaos_run_elts;
- vaos->machine = align_malloc( sizeof(struct aos_machine), 16 );
- if (!vaos->machine)
+ vaos->draw = vs->draw;
+
+ for (i = 0; i < key->nr_inputs; i++)
+ vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
+
+ vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
+ if (!vaos->buffer)
goto fail;
-
- memset(vaos->machine, 0, sizeof(struct aos_machine));
- init_internals(vaos->machine);
+ if (0)
+ debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+
+#if 0
tgsi_dump(vs->state.tokens, 0);
+#endif
if (!build_vertex_program( vaos, TRUE ))
goto fail;
if (!build_vertex_program( vaos, FALSE ))
goto fail;
- vaos->gen_run_linear = (vsv_run_linear_func)x86_get_func(&vaos->func[0]);
+ vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
if (!vaos->gen_run_linear)
goto fail;
- vaos->gen_run_elts = (vsv_run_elts_func)x86_get_func(&vaos->func[1]);
+ vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
if (!vaos->gen_run_elts)
goto fail;
return &vaos->base;
fail:
- if (vaos->machine)
- align_free( vaos->machine );
+ if (vaos && vaos->buffer)
+ FREE(vaos->buffer);
if (vaos)
x86_release_func( &vaos->func[0] );
struct draw_vs_varient *varient = varient_aos_sse( vs, key );
if (varient == NULL) {
- assert(0);
varient = draw_vs_varient_generic( vs, key );
}
-#endif
+#endif /* PIPE_ARCH_X86 */