}
-
-
/**
* Perform a reduced swizzle:
*/
/* Need a temporary to be correct in the case where result == arg0.
*/
COPY_4V(tmp, arg0);
-
- result[0] = tmp[GET_RSW(swz, 0)];
- result[1] = tmp[GET_RSW(swz, 1)];
- result[2] = tmp[GET_RSW(swz, 2)];
- result[3] = tmp[GET_RSW(swz, 3)];
-
+
+ result[0] = tmp[GET_SWZ(swz, 0)];
+ result[1] = tmp[GET_SWZ(swz, 1)];
+ result[2] = tmp[GET_SWZ(swz, 2)];
+ result[3] = tmp[GET_SWZ(swz, 3)];
+
+ if (neg) {
+ if (neg & 0x1) result[0] = -result[0];
+ if (neg & 0x2) result[1] = -result[1];
+ if (neg & 0x4) result[2] = -result[2];
+ if (neg & 0x8) result[3] = -result[3];
+ }
+}
+
+/**
+ * Perform a full swizzle
+ */
+static void do_SWZ( struct arb_vp_machine *m, union instruction op )
+{
+ GLfloat *result = m->File[0][op.rsw.dst];
+ const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
+ GLuint swz = op.rsw.swz;
+ GLuint neg = op.rsw.neg;
+ GLfloat tmp[6];
+ tmp[4] = 0.0;
+ tmp[5] = 1.0;
+
+ /* Need a temporary to be correct in the case where result == arg0.
+ */
+ COPY_4V(tmp, arg0);
+
+ result[0] = tmp[GET_SWZ(swz, 0)];
+ result[1] = tmp[GET_SWZ(swz, 1)];
+ result[2] = tmp[GET_SWZ(swz, 2)];
+ result[3] = tmp[GET_SWZ(swz, 3)];
+
if (neg) {
if (neg & 0x1) result[0] = -result[0];
if (neg & 0x2) result[1] = -result[1];
_mesa_printf(", ");
print_reg(op.rsw.file0, op.rsw.idx0);
_mesa_printf(".");
- for (i = 0; i < 4; i++, swz >>= 2) {
- const char *cswz = "xyzw";
+ for (i = 0; i < 4; i++, swz >>= 3) {
+ const char *cswz = "xyzw01";
if (neg & (1<<i))
_mesa_printf("-");
- _mesa_printf("%c", cswz[swz&0x3]);
+ _mesa_printf("%c", cswz[swz&0x7]);
+ }
+ _mesa_printf("\n");
+}
+/**
+ * Debug-print a SWZ (full swizzle) instruction using _mesa_printf.
+ *
+ * The line has the form "SWZ <dst>, <src>.<c0><c1><c2><c3>": the destination
+ * is printed as a register of file 0, the source as (file0, idx0), and each
+ * of the four components is decoded from a 3-bit field of op.rsw.swz into
+ * one character of "xyzw01".  A '-' is printed in front of component i when
+ * bit i of op.rsw.neg is set.
+ */
+static void print_SWZ( union instruction op )
+{
+ GLuint swz = op.rsw.swz;
+ GLuint neg = op.rsw.neg;
+ GLuint i;
+
+ _mesa_printf("SWZ ");
+ print_reg(0, op.rsw.dst);
+ _mesa_printf(", ");
+ print_reg(op.rsw.file0, op.rsw.idx0);
+ _mesa_printf(".");
+ for (i = 0; i < 4; i++, swz >>= 3) { /* one 3-bit selector per component, lowest bits first */
+ const char *cswz = "xyzw01";
+ if (neg & (1<<i))
+ _mesa_printf("-");
+ _mesa_printf("%c", cswz[swz&0x7]); /* NOTE(review): selector values 6 and 7 index past the six letters -- presumably never generated, confirm */
}
_mesa_printf("\n");
}
case OPCODE_RCC:
case OPCODE_RET:
case OPCODE_SSG:
- case OPCODE_SWZ:
print_NOP(op);
break;
+ case OPCODE_SWZ:
+ print_SWZ(op);
+ break;
case RSW:
print_RSW(op);
break;
do_NOP,/*SSG*/
do_NOP,/*STR*/
do_SUB,
- do_RSW,/*SWZ*/
+ do_SWZ,/*SWZ*/
do_NOP,/*TEX*/
do_NOP,/*TXB*/
do_NOP,/*TXD*/
{
struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
union instruction rsw, noop;
-
+
/* Emit any necessary swizzling.
*/
_mesa_bzero(&rsw, sizeof(rsw));
/* we're expecting 2-bit swizzles below... */
#if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */
+/* hopefully no longer happens? */
ASSERT(GET_SWZ(src->Swizzle, 0) < 4);
ASSERT(GET_SWZ(src->Swizzle, 1) < 4);
ASSERT(GET_SWZ(src->Swizzle, 2) < 4);
ASSERT(GET_SWZ(src->Swizzle, 3) < 4);
#endif
- rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
- (GET_SWZ(src->Swizzle, 1) << 2) |
- (GET_SWZ(src->Swizzle, 2) << 4) |
- (GET_SWZ(src->Swizzle, 3) << 6));
+ rsw.rsw.swz = src->Swizzle;
_mesa_bzero(&noop, sizeof(noop));
noop.rsw.neg = 0;
- noop.rsw.swz = RSW_NOOP;
+ noop.rsw.swz = SWIZZLE_NOOP;
if (_mesa_memcmp(&rsw, &noop, sizeof(rsw)) !=0) {
union instruction *op = cvp_next_instruction(cp);
}
}
-static struct reg cvp_emit_rsw( struct compilation *cp,
- GLuint dst,
- struct reg src,
- GLuint neg,
- GLuint swz,
- GLboolean force)
-{
- struct reg retval;
-
- if (swz != RSW_NOOP || neg != 0) {
- union instruction *op = cvp_next_instruction(cp);
- op->rsw.opcode = RSW;
- op->rsw.dst = dst;
- op->rsw.file0 = src.file;
- op->rsw.idx0 = src.idx;
- op->rsw.neg = neg;
- op->rsw.swz = swz;
-
- retval.file = FILE_REG;
- retval.idx = dst;
- return retval;
- }
- else if (force) {
- /* Oops. Degenerate case:
- */
- union instruction *op = cvp_next_instruction(cp);
- op->alu.opcode = OPCODE_MOV;
- op->alu.dst = dst;
- op->alu.file0 = src.file;
- op->alu.idx0 = src.idx;
-
- retval.file = FILE_REG;
- retval.idx = dst;
- return retval;
- }
- else {
- return src;
- }
-}
-
static void cvp_emit_inst( struct compilation *cp,
const struct prog_instruction *inst )
op->alu.idx0 = reg[0].idx;
break;
- case OPCODE_SWZ: {
- GLuint swz0 = 0, swz1 = 0;
- GLuint neg0 = 0, neg1 = 0;
- GLuint mask = 0;
-
- /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
- * one from the source register the other from a constant
- * {0,0,0,1}.
- */
- for (i = 0; i < 4; i++) {
- GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
- if (swzelt >= SWIZZLE_ZERO) {
- neg0 |= inst->SrcReg[0].NegateBase & (1<<i);
- if (swzelt == SWIZZLE_ONE)
- swz0 |= SWIZZLE_W << (i*2);
- else if (i < SWIZZLE_W)
- swz0 |= i << (i*2);
- }
- else {
- mask |= 1<<i;
- neg1 |= inst->SrcReg[0].NegateBase & (1<<i);
- swz1 |= swzelt << (i*2);
- }
- }
+ case OPCODE_END:
+ break;
+ case OPCODE_SWZ:
result = cvp_choose_result( cp, &inst->DstReg, &fixup );
- reg[0].file = FILE_REG;
- reg[0].idx = REG_ID;
- reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
-
- if (mask == WRITEMASK_XYZW) {
- cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
-
- }
- else if (mask == 0) {
- cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
- }
- else {
- cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
- reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE);
-
- op = cvp_next_instruction(cp);
- op->msk.opcode = MSK;
- op->msk.dst = result;
- op->msk.file = reg[1].file;
- op->msk.idx = reg[1].idx;
- op->msk.mask = mask;
- }
+ reg[0] = cvp_load_reg( cp, inst->SrcReg[0].File,
+ inst->SrcReg[0].Index, inst->SrcReg[0].RelAddr, REG_ARG0 );
+ op = cvp_next_instruction(cp);
+ op->rsw.opcode = inst->Opcode;
+ op->rsw.file0 = reg[0].file;
+ op->rsw.idx0 = reg[0].idx;
+ op->rsw.dst = result;
+ op->rsw.swz = inst->SrcReg[0].Swizzle;
+ op->rsw.neg = inst->SrcReg[0].NegateBase;
if (result == REG_RES) {
op = cvp_next_instruction(cp);
*op = fixup;
}
break;
- }
-
- case OPCODE_END:
- break;
default:
result = cvp_choose_result( cp, &inst->DstReg, &fixup );
if (result == REG_RES) {
op = cvp_next_instruction(cp);
*op = fixup;
- }
+ }
break;
}
}
*/
ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1);
- ASSIGN_4V(m->File[0][REG_SWZ], -1, 1, 0, 0);
+ ASSIGN_4V(m->File[0][REG_SWZ], 1, -1, 0, 0);
ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1);
ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1);
ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */
#define REG_IN31 63
#define REG_ID 64 /* 0,0,0,1 */
#define REG_ONES 65 /* 1,1,1,1 */
-#define REG_SWZ 66 /* -1,1,0,0 */
+#define REG_SWZ 66 /* 1,-1,0,0 */
#define REG_NEG 67 /* -1,-1,-1,-1 */
#define REG_LIT 68 /* 1,0,0,1 */
#define REG_LIT2 69 /* 1,0,0,1 */
GLuint file0:2;
GLuint idx0:7;
GLuint neg:4;
- GLuint swz:8; /* xyzw only */
+ GLuint swz:12; /* xyzw01 */
} rsw;
struct {
/**
- * Reduced swizzle is a 2-bit field; only X/Y/Z/W are allowed, not 0/1.
+ * Reduced swizzle now uses 3 bits per component -- for simplicity the same encoding as a normal swizzle -- so X/Y/Z/W/0/1 are all allowed.
*/
-#define RSW_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-#define GET_RSW(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
-
struct input {
GLuint idx;
{
struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
- GLuint swz = op.rsw.swz;
+ GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
+ (GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
GLuint neg = op.rsw.neg;
emit_pshufd(cp, dst, arg0, swz);
-
+
if (neg) {
struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
struct x86_reg tmp = get_xmm_reg(cp);
* Use neg as arg to pshufd
* Multiply
*/
+ /* is the emit_pshufd necessary? only SWZ can negate individual components */
emit_pshufd(cp, tmp, negs,
SHUF((neg & 1) ? 1 : 0,
(neg & 2) ? 1 : 0,
return GL_TRUE;
}
+/* Perform a full swizzle
+ *
+ * Each result component is selected by a 3-bit field of op.rsw.swz:
+ * 0..3 pick x/y/z/w of the source, 4 yields the constant 0 and 5 yields
+ * the constant 1.  Bit i of op.rsw.neg negates result component i.
+ *
+ * Code generation is done in three steps:
+ *  1. pshufd the source into dst using the low two bits of each selector;
+ *  2. if any selector is 5, patch 1.0 into the lanes that ask for it;
+ *  3. multiply by a vector shuffled out of REG_SWZ {1,-1,0,0}, which
+ *     applies the negations and forces the "0" lanes to zero in one go.
+ *
+ * Always returns GL_TRUE.
+ */
+static GLboolean emit_SWZ( struct compilation *cp, union instruction op )
+{
+   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
+   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
+   struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
+   struct x86_reg tmp = get_xmm_reg(cp);
+   GLubyte neg = op.rsw.neg;
+   GLubyte shuf2, swz, savepos, savemask, swizzle[4];
+
+   swizzle[0] = GET_SWZ(op.rsw.swz, 0);
+   swizzle[1] = GET_SWZ(op.rsw.swz, 1);
+   swizzle[2] = GET_SWZ(op.rsw.swz, 2);
+   swizzle[3] = GET_SWZ(op.rsw.swz, 3);
+
+   /* pshufd only understands 2-bit selectors; the lanes that want 0 or 1
+    * get an arbitrary source component here and are fixed up below.
+    */
+   swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
+              (swizzle[2] & 3), (swizzle[3] & 3));
+
+   emit_pshufd(cp, dst, arg0, swz);
+
+   /* can handle negation and replace with zero with the same shuffle/mul:
+    * index 0 of REG_SWZ is 1 (keep), index 1 is -1 (negate), index 2 is 0.
+    */
+   shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
+                swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
+                swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
+                swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
+
+   /* now the hard part is getting those 1's in there...
+    *
+    * savemask is the identity shuffle, except that every lane 1..3 asking
+    * for a 1 reads lane 0 instead.  savepos is (1 + a lane that asks for
+    * a 1), or 0 when no lane does.
+    */
+   savepos = 0;
+   savemask = 0;
+   if (swizzle[1] == 5) savepos = 2;
+   else savemask |= 1 << 2;
+   if (swizzle[2] == 5) savepos = 3;
+   else savemask |= 2 << 4;
+   if (swizzle[3] == 5) savepos = 4;
+   else savemask |= 3 << 6;
+   /* Lane 0 must be tested last: when it asks for a 1 itself, the final
+    * shuffle has to make lane 0 read lane 0 (which holds the 1.0), even if
+    * other lanes ask for a 1 as well.  Testing it first let a later lane
+    * overwrite savepos, so lane 0 got the saved source value instead.
+    */
+   if (swizzle[0] == 5) savepos = 1;
+   if (savepos) {
+      /* need a mov first as movss from memory will overwrite high bits of xmm reg */
+      sse_movups(&cp->func, tmp, negs);
+      /* can only replace lowest 32bits, thus move away that part first:
+       * every lane that wants a 1 now holds a copy of lane 0.
+       */
+      emit_pshufd(cp, dst, dst, savemask);
+      /* lane 0 <- 1.0 (first element of REG_SWZ) */
+      sse_movss(&cp->func, dst, tmp);
+      /* "1" lanes read lane 0; lane 0 reads back its saved copy (or itself
+       * when savepos == 1); all other lanes stay in place.
+       */
+      emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc));
+   }
+
+   if (shuf2) {
+      /* Load 1,-1,0,0
+       * Use neg as arg to pshufd
+       * Multiply
+       */
+      emit_pshufd(cp, tmp, negs, shuf2);
+      sse_mulps(&cp->func, dst, tmp);
+   }
+
+   return GL_TRUE;
+}
+
/* Helper for writemask:
*/
static GLboolean emit_shuf_copy1( struct compilation *cp,
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
- struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
- struct x86_reg tmp = get_xmm_reg(cp);
+ struct x86_reg tmp = get_xmm_reg(cp);
- emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
- sse_movss(&cp->func, dst, ones);
- emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
+ sse_movups(&cp->func, dst, arg0);
sse_mulps(&cp->func, dst, arg1);
-
- /* Now the hard bit: sum the values (from DP4):
+
+ /* Now the hard bit: sum the values (from DP3):
*/
sse_movhlps(&cp->func, tmp, dst);
- sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
+ sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(&cp->func, dst, tmp);
+ emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
+ sse_addss(&cp->func, dst, tmp);
sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
return GL_TRUE;
}
{
struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-
- /* TODO: Calculate absolute value
- */
#if 0
+ struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
+
+/* get abs value first. This STILL doesn't work.
+ Looks like we get bogus neg values ?
+*/
sse_movss(&cp->func, dst, arg0);
sse_mulss(&cp->func, dst, neg);
sse_maxss(&cp->func, dst, arg0);
-#endif
+ sse_rsqrtss(&cp->func, dst, dst);
+#endif
sse_rsqrtss(&cp->func, dst, arg0);
sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
return GL_TRUE;
emit_NOP, /* SSG */
emit_NOP, /* STR */
emit_SUB,
- emit_RSW, /* SWZ */
+ emit_SWZ, /* SWZ */
emit_NOP, /* TEX */
emit_NOP, /* TXB */
emit_NOP, /* TXD */
emit_modrm( p, dst, src );
}
+void sse_maxss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{ /* Emits the encoding of the SSE scalar single-precision maximum, "maxss dst, src", into p. */
+ emit_3ub(p, 0xF3, X86_TWOB, 0x5F); /* opcode bytes; F3 0F 5F is MAXSS, assuming X86_TWOB is the 0x0F two-byte escape -- TODO confirm */
+ emit_modrm( p, dst, src ); /* presumably encodes the dst/src operands; helper is defined outside this view */
+}
+}
+
void sse_divss( struct x86_function *p,
struct x86_reg dst,
struct x86_reg src )
emit_modrm( p, dst, src );
}
+void sse_mulss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{ /* Emits the encoding of the SSE scalar single-precision multiply, "mulss dst, src", into p. */
+ emit_3ub(p, 0xF3, X86_TWOB, 0x59); /* opcode bytes; F3 0F 59 is MULSS, assuming X86_TWOB is the 0x0F two-byte escape -- TODO confirm */
+ emit_modrm( p, dst, src ); /* presumably encodes the dst/src operands; helper is defined outside this view */
+}
+
+
void sse_addps( struct x86_function *p,
struct x86_reg dst,
struct x86_reg src )
void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, GLubyte cc );
void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, GLubyte shuf );