From fe57ed4f2566e30384d0c786998842405d8e8990 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <rscheidegger@gmx.ch>
Date: Thu, 1 Jun 2006 22:56:40 +0000
Subject: [PATCH] Fix extended swizzling in vertex programs by introducing
 special swizzle instruction, extend the 2 bit rsw field to 3 bit like used in
 other places. While here, also fix up rsw (negation), dph and try to fix up
 rsq with negative values (doesn't work, bug seems elsewhere) in the sse
 codegen code.

---
 src/mesa/tnl/t_vb_arbprogram.c     | 192 ++++++++++++-----------------
 src/mesa/tnl/t_vb_arbprogram.h     |   9 +-
 src/mesa/tnl/t_vb_arbprogram_sse.c |  92 +++++++++++---
 src/mesa/x86/rtasm/x86sse.c        |  16 +++
 src/mesa/x86/rtasm/x86sse.h        |   2 +
 5 files changed, 179 insertions(+), 132 deletions(-)

diff --git a/src/mesa/tnl/t_vb_arbprogram.c b/src/mesa/tnl/t_vb_arbprogram.c
index 88d8fe95464..d034929fe0a 100644
--- a/src/mesa/tnl/t_vb_arbprogram.c
+++ b/src/mesa/tnl/t_vb_arbprogram.c
@@ -115,8 +115,6 @@ static GLfloat rough_approx_log2_0_1(GLfloat x)
 }
 
 
-
-
 /**
  * Perform a reduced swizzle:
  */
@@ -131,12 +129,42 @@ static void do_RSW( struct arb_vp_machine *m, union instruction op )
    /* Need a temporary to be correct in the case where result == arg0.
     */
    COPY_4V(tmp, arg0);
-   
-   result[0] = tmp[GET_RSW(swz, 0)];
-   result[1] = tmp[GET_RSW(swz, 1)];
-   result[2] = tmp[GET_RSW(swz, 2)];
-   result[3] = tmp[GET_RSW(swz, 3)];
-   
+
+   result[0] = tmp[GET_SWZ(swz, 0)];
+   result[1] = tmp[GET_SWZ(swz, 1)];
+   result[2] = tmp[GET_SWZ(swz, 2)];
+   result[3] = tmp[GET_SWZ(swz, 3)];
+
+   if (neg) {
+      if (neg & 0x1) result[0] = -result[0];
+      if (neg & 0x2) result[1] = -result[1];
+      if (neg & 0x4) result[2] = -result[2];
+      if (neg & 0x8) result[3] = -result[3];
+   }
+}
+
+/**
+ * Perform a full swizzle
+ */
+static void do_SWZ( struct arb_vp_machine *m, union instruction op ) 
+{
+   GLfloat *result = m->File[0][op.rsw.dst];
+   const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
+   GLuint swz = op.rsw.swz;
+   GLuint neg = op.rsw.neg;
+   GLfloat tmp[6];
+   tmp[4] = 0.0;
+   tmp[5] = 1.0;
+
+   /* Need a temporary to be correct in the case where result == arg0.
+    */
+   COPY_4V(tmp, arg0);
+
+   result[0] = tmp[GET_SWZ(swz, 0)];
+   result[1] = tmp[GET_SWZ(swz, 1)];
+   result[2] = tmp[GET_SWZ(swz, 2)];
+   result[3] = tmp[GET_SWZ(swz, 3)];
+
    if (neg) {
       if (neg & 0x1) result[0] = -result[0];
       if (neg & 0x2) result[1] = -result[1];
@@ -570,11 +598,31 @@ static void print_RSW( union instruction op )
    _mesa_printf(", ");
    print_reg(op.rsw.file0, op.rsw.idx0);
    _mesa_printf(".");
-   for (i = 0; i < 4; i++, swz >>= 2) {
-      const char *cswz = "xyzw";
+   for (i = 0; i < 4; i++, swz >>= 3) {
+      const char *cswz = "xyzw01";
       if (neg & (1<<i))   
 	 _mesa_printf("-");
-      _mesa_printf("%c", cswz[swz&0x3]);
+      _mesa_printf("%c", cswz[swz&0x7]);
+   }
+   _mesa_printf("\n");
+}
+
+/* Disassemble a SWZ instruction: dst, src, then one selector character per
+ * component (3 bits each, from "xyzw01"), each preceded by '-' if negated. */
+static void print_SWZ( union instruction op )
+{
+   /* padded to 8 entries so the unused selectors 6 and 7 index in bounds */
+   static const char cswz[] = "xyzw01??";
+   GLuint swz = op.rsw.swz, neg = op.rsw.neg, i;
+
+   _mesa_printf("SWZ ");
+   print_reg(0, op.rsw.dst);
+   _mesa_printf(", ");
+   print_reg(op.rsw.file0, op.rsw.idx0);
+   _mesa_printf(".");
+   for (i = 0; i < 4; i++, swz >>= 3) {
+      if (neg & (1<<i)) _mesa_printf("-");
+      _mesa_printf("%c", cswz[swz&0x7]);
    }
    _mesa_printf("\n");
 }
@@ -651,9 +699,11 @@ _tnl_disassem_vba_insn( union instruction op )
    case OPCODE_RCC:
    case OPCODE_RET:
    case OPCODE_SSG:
-   case OPCODE_SWZ:
       print_NOP(op);
       break;
+   case OPCODE_SWZ:
+      print_SWZ(op);
+      break;
    case RSW:
       print_RSW(op);
       break;
@@ -728,7 +778,7 @@ static void (* const opcode_func[MAX_OPCODE+3])(struct arb_vp_machine *, union i
    do_NOP,/*SSG*/
    do_NOP,/*STR*/
    do_SUB,
-   do_RSW,/*SWZ*/
+   do_SWZ,/*SWZ*/
    do_NOP,/*TEX*/
    do_NOP,/*TXB*/
    do_NOP,/*TXD*/
@@ -833,7 +883,7 @@ static struct reg cvp_emit_arg( struct compilation *cp,
 {
    struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
    union instruction rsw, noop;
-   
+
    /* Emit any necessary swizzling.  
     */
    _mesa_bzero(&rsw, sizeof(rsw));
@@ -841,19 +891,17 @@ static struct reg cvp_emit_arg( struct compilation *cp,
 
    /* we're expecting 2-bit swizzles below... */
 #if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */
+/* hopefully no longer happens? */
    ASSERT(GET_SWZ(src->Swizzle, 0) < 4);
    ASSERT(GET_SWZ(src->Swizzle, 1) < 4);
    ASSERT(GET_SWZ(src->Swizzle, 2) < 4);
    ASSERT(GET_SWZ(src->Swizzle, 3) < 4);
 #endif
-   rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
-		  (GET_SWZ(src->Swizzle, 1) << 2) |
-		  (GET_SWZ(src->Swizzle, 2) << 4) |
-		  (GET_SWZ(src->Swizzle, 3) << 6));
+   rsw.rsw.swz = src->Swizzle;
 
    _mesa_bzero(&noop, sizeof(noop));
    noop.rsw.neg = 0;
-   noop.rsw.swz = RSW_NOOP;
+   noop.rsw.swz = SWIZZLE_NOOP;
 
    if (_mesa_memcmp(&rsw, &noop, sizeof(rsw)) !=0) {
       union instruction *op = cvp_next_instruction(cp);
@@ -907,46 +955,6 @@ static GLuint cvp_choose_result( struct compilation *cp,
    }
 }
 
-static struct reg cvp_emit_rsw( struct compilation *cp, 
-				GLuint dst,
-				struct reg src,
-				GLuint neg, 
-				GLuint swz,
-				GLboolean force)
-{
-   struct reg retval;
-
-   if (swz != RSW_NOOP || neg != 0) {
-      union instruction *op = cvp_next_instruction(cp);
-      op->rsw.opcode = RSW;
-      op->rsw.dst = dst;
-      op->rsw.file0 = src.file;
-      op->rsw.idx0 = src.idx;
-      op->rsw.neg = neg;
-      op->rsw.swz = swz;
-	    
-      retval.file = FILE_REG;
-      retval.idx = dst;
-      return retval;
-   }
-   else if (force) {
-      /* Oops.  Degenerate case:
-       */
-      union instruction *op = cvp_next_instruction(cp);
-      op->alu.opcode = OPCODE_MOV;
-      op->alu.dst = dst;
-      op->alu.file0 = src.file;
-      op->alu.idx0 = src.idx;
-      
-      retval.file = FILE_REG;
-      retval.idx = dst;
-      return retval;
-   }
-   else {
-      return src;
-   }
-}
-
 
 static void cvp_emit_inst( struct compilation *cp,
 			   const struct prog_instruction *inst )
@@ -998,64 +1006,26 @@ static void cvp_emit_inst( struct compilation *cp,
       op->alu.idx0 = reg[0].idx;
       break;
 
-   case OPCODE_SWZ: {
-      GLuint swz0 = 0, swz1 = 0;
-      GLuint neg0 = 0, neg1 = 0;
-      GLuint mask = 0;
-
-      /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
-       * one from the source register the other from a constant
-       * {0,0,0,1}.
-       */
-      for (i = 0; i < 4; i++) {
-	 GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
-	 if (swzelt >= SWIZZLE_ZERO) {
-	    neg0 |= inst->SrcReg[0].NegateBase & (1<<i);
-	    if (swzelt == SWIZZLE_ONE)
-	       swz0 |= SWIZZLE_W << (i*2);
-	    else if (i < SWIZZLE_W)
-	       swz0 |= i << (i*2);
-	 }
-	 else {
-	    mask |= 1<<i;
-	    neg1 |= inst->SrcReg[0].NegateBase & (1<<i);
-	    swz1 |= swzelt << (i*2);
-	 }
-      }
+   case OPCODE_END:
+      break;
 
+   case OPCODE_SWZ:
       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
-      reg[0].file = FILE_REG;
-      reg[0].idx = REG_ID;
-      reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
-
-      if (mask == WRITEMASK_XYZW) {
-	 cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
-	 
-      }
-      else if (mask == 0) {
-	 cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
-      }
-      else {
-	 cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
-	 reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE);
-
-	 op = cvp_next_instruction(cp);
-	 op->msk.opcode = MSK;
-	 op->msk.dst = result;
-	 op->msk.file = reg[1].file;
-	 op->msk.idx = reg[1].idx;
-	 op->msk.mask = mask;
-      }
+      reg[0] = cvp_load_reg( cp, inst->SrcReg[0].File,
+			inst->SrcReg[0].Index, inst->SrcReg[0].RelAddr, REG_ARG0 );
+      op = cvp_next_instruction(cp);
+      op->rsw.opcode = inst->Opcode;
+      op->rsw.file0 = reg[0].file;
+      op->rsw.idx0 = reg[0].idx;
+      op->rsw.dst = result;
+      op->rsw.swz = inst->SrcReg[0].Swizzle;
+      op->rsw.neg = inst->SrcReg[0].NegateBase;
 
       if (result == REG_RES) {
 	 op = cvp_next_instruction(cp);
 	 *op = fixup;
       }
       break;
-   }
-
-   case OPCODE_END:
-      break;
 
    default:
       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
@@ -1074,7 +1044,7 @@ static void cvp_emit_inst( struct compilation *cp,
       if (result == REG_RES) {
 	 op = cvp_next_instruction(cp);
 	 *op = fixup;
-      }      	 
+      }
       break;
    }
 }
@@ -1485,7 +1455,7 @@ static GLboolean init_vertex_program( GLcontext *ctx,
     */
    ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
    ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1);
-   ASSIGN_4V(m->File[0][REG_SWZ], -1, 1, 0, 0);
+   ASSIGN_4V(m->File[0][REG_SWZ], 1, -1, 0, 0);
    ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1);
    ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1);
    ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */
diff --git a/src/mesa/tnl/t_vb_arbprogram.h b/src/mesa/tnl/t_vb_arbprogram.h
index 60786d6a016..dab725d7f7a 100644
--- a/src/mesa/tnl/t_vb_arbprogram.h
+++ b/src/mesa/tnl/t_vb_arbprogram.h
@@ -61,7 +61,7 @@
 #define REG_IN31   63
 #define REG_ID     64		/* 0,0,0,1 */
 #define REG_ONES   65		/* 1,1,1,1 */
-#define REG_SWZ    66		/* -1,1,0,0 */
+#define REG_SWZ    66		/* 1,-1,0,0 */
 #define REG_NEG    67		/* -1,-1,-1,-1 */
 #define REG_LIT    68           /* 1,0,0,1 */
 #define REG_LIT2    69           /* 1,0,0,1 */
@@ -98,7 +98,7 @@ union instruction {
       GLuint file0:2;
       GLuint idx0:7;
       GLuint neg:4;
-      GLuint swz:8;		/* xyzw only */
+      GLuint swz:12;		/* xyzw01 */
    } rsw;
 
    struct {
@@ -114,11 +114,8 @@ union instruction {
 
 
 /**
- * Reduced swizzle is a 2-bit field; only X/Y/Z/W are allowed, not 0/1.
+ * Reduced swizzle is a 3-bit field, for simplicity same as normal swizzle, X/Y/Z/W/0/1 allowed.
  */
-#define RSW_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-#define GET_RSW(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)
-
 
 struct input {
    GLuint idx;
diff --git a/src/mesa/tnl/t_vb_arbprogram_sse.c b/src/mesa/tnl/t_vb_arbprogram_sse.c
index 19061c0d8d1..b9126d6d886 100644
--- a/src/mesa/tnl/t_vb_arbprogram_sse.c
+++ b/src/mesa/tnl/t_vb_arbprogram_sse.c
@@ -294,11 +294,12 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
-   GLuint swz = op.rsw.swz;
+   GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
+		(GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
    GLuint neg = op.rsw.neg;
 
    emit_pshufd(cp, dst, arg0, swz);
-   
+
    if (neg) {
       struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
       struct x86_reg tmp = get_xmm_reg(cp);
@@ -306,6 +307,7 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
        * Use neg as arg to pshufd
        * Multiply
        */
+      /* is the emit_pshufd necessary? only SWZ can negate individual components */
       emit_pshufd(cp, tmp, negs, 
 		  SHUF((neg & 1) ? 1 : 0,
 		       (neg & 2) ? 1 : 0,
@@ -317,6 +319,64 @@ static GLboolean emit_RSW( struct compilation *cp, union instruction op )
    return GL_TRUE;
 }
 
+/* Perform a full swizzle: each destination component selects x/y/z/w (0-3),
+ * the constant 0 (4) or the constant 1 (5) and may additionally be negated.
+ * Zeros and negation share one multiply by a shuffle of REG_SWZ (1,-1,0,0);
+ * ones are inserted through the low lane with movss. Always returns GL_TRUE.
+ */
+static GLboolean emit_SWZ( struct compilation *cp, union instruction op )
+{
+   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
+   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
+   struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
+   struct x86_reg tmp = get_xmm_reg(cp);
+   GLubyte neg = op.rsw.neg;
+   GLubyte shuf2, swz, savepos, savemask, swizzle[4];
+
+   swizzle[0] = GET_SWZ(op.rsw.swz, 0);
+   swizzle[1] = GET_SWZ(op.rsw.swz, 1);
+   swizzle[2] = GET_SWZ(op.rsw.swz, 2);
+   swizzle[3] = GET_SWZ(op.rsw.swz, 3);
+
+   swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
+	      (swizzle[2] & 3), (swizzle[3] & 3));
+   emit_pshufd(cp, dst, arg0, swz);
+
+   /* can handle negation and replace with zero with the same shuffle/mul */
+   shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
+	        swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
+	        swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
+	        swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
+
+   /* now the hard part is getting those 1's in there... */
+   savepos = 0;
+   savemask = 0;
+   if (swizzle[0] == 5) savepos = 1;
+   if (swizzle[1] == 5) savepos = 2;
+   else savemask |= 1 << 2;
+   if (swizzle[2] == 5) savepos = 3;
+   else savemask |= 2 << 4;
+   if (swizzle[3] == 5) savepos = 4;
+   else savemask |= 3 << 6;
+   if (savepos) {
+      /* need a mov first as movss from memory will overwrite high bits of xmm reg */
+      sse_movups(&cp->func, tmp, negs);
+      /* can only replace lowest 32bits, thus move away that part first */
+      emit_pshufd(cp, dst, dst, savemask);
+      sse_movss(&cp->func, dst, tmp);
+      /* lane 0 takes back its saved value, unless it wants the 1 itself */
+      emit_pshufd(cp, dst, dst, (swizzle[0] == 5 ? 0 : savepos - 1) | (savemask & 0xfc));
+   }
+
+   if (shuf2) {
+      /* multiply by per-lane factors picked from REG_SWZ (1,-1,0,0) */
+      emit_pshufd(cp, tmp, negs, shuf2);
+      sse_mulps(&cp->func, dst, tmp);
+   }
+
+   return GL_TRUE;
+}
+
 /* Helper for writemask:
  */
 static GLboolean emit_shuf_copy1( struct compilation *cp,
@@ -595,20 +655,19 @@ static GLboolean emit_DPH( struct compilation *cp, union instruction op )
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); 
    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); 
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
-   struct x86_reg tmp = get_xmm_reg(cp);      
+   struct x86_reg tmp = get_xmm_reg(cp);
 
-   emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
-   sse_movss(&cp->func, dst, ones);
-   emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
+   sse_movups(&cp->func, dst, arg0);
    sse_mulps(&cp->func, dst, arg1);
-   
-   /* Now the hard bit: sum the values (from DP4):
+
+   /* Now the hard bit: sum the values (from DP3):
     */ 
    sse_movhlps(&cp->func, tmp, dst);
-   sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
+   sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
    sse_addss(&cp->func, dst, tmp);
+   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
+   sse_addss(&cp->func, dst, tmp);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
 }
@@ -985,15 +1044,18 @@ static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
 {
    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
-
-   /* TODO: Calculate absolute value
-    */
 #if 0
+   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
+
+/* get abs value first. This STILL doesn't work.
+   Looks like we get bogus neg values ?
+*/
    sse_movss(&cp->func, dst, arg0);
    sse_mulss(&cp->func, dst, neg);
    sse_maxss(&cp->func, dst, arg0);
-#endif
 
+   sse_rsqrtss(&cp->func, dst, dst);
+#endif
    sse_rsqrtss(&cp->func, dst, arg0);
    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
    return GL_TRUE;
@@ -1132,7 +1194,7 @@ static GLboolean (* const emit_func[])(struct compilation *, union instruction)
    emit_NOP, /* SSG */
    emit_NOP, /* STR */
    emit_SUB,
-   emit_RSW, /* SWZ */
+   emit_SWZ, /* SWZ */
    emit_NOP, /* TEX */
    emit_NOP, /* TXB */
    emit_NOP, /* TXD */
diff --git a/src/mesa/x86/rtasm/x86sse.c b/src/mesa/x86/rtasm/x86sse.c
index 9f34004ba0c..6137aef8ece 100644
--- a/src/mesa/x86/rtasm/x86sse.c
+++ b/src/mesa/x86/rtasm/x86sse.c
@@ -424,6 +424,14 @@ void sse_maxps( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+/* MAXSS dst, src: opcode bytes F3, X86_TWOB (presumably 0x0F), 5F + modrm */
+void sse_maxss( struct x86_function *p,
+		struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
+   emit_modrm( p, dst, src );
+}
+
 void sse_divss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
@@ -456,6 +464,14 @@ void sse_mulps( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+/* MULSS dst, src: opcode bytes F3, X86_TWOB (presumably 0x0F), 59 + modrm */
+void sse_mulss( struct x86_function *p,
+		struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x59);
+   emit_modrm( p, dst, src );
+}
+
 void sse_addps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
diff --git a/src/mesa/x86/rtasm/x86sse.h b/src/mesa/x86/rtasm/x86sse.h
index 430cf2f939d..5ec54894311 100644
--- a/src/mesa/x86/rtasm/x86sse.h
+++ b/src/mesa/x86/rtasm/x86sse.h
@@ -156,6 +156,7 @@ void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, GLubyte cc );
 void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -165,6 +166,7 @@ void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src
 void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, GLubyte shuf );
-- 
2.30.2