-/* $Id: 3dnow_normal.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+/* $Id: 3dnow_normal.S,v 1.6 2003/11/26 08:32:35 dborca Exp $ */
/*
* Mesa 3-D graphics library
- * Version: 3.5
+ * Version: 5.1
*
- * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
MOV_L ( ARG_LENGTHS, EDI )
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V3F_COUNT, EAX) )
- MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
MOV_L ( ARG_MAT, ECX )
MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
JE ( LLBL (G3TN_end) )
- MOV_L ( REGOFF (V3F_COUNT, ESI), EBP )
+ MOV_L ( REGOFF (V4F_COUNT, ESI), EBP )
FEMMS
PUSH_L ( EBP )
PFMUL ( MM0, MM6 ) /* scale * m9 | scale * m8 */
PFMUL ( MM0, MM7 ) /* | scale * m10 */
+ALIGNTEXT32
LLBL (G3TN_scale_end):
+LLBL (G3TN_transform):
MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */
MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
-ALIGNTEXT32
-LLBL (G3TN_transform):
MOVQ ( MM0, MM1 ) /* x1 | x0 */
PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PREFETCHW ( REGIND(EAX) )
PFADD ( MM2, MM0 ) /* x0*m4+x1*m5+x2*m6| x0*m0+...+x2**/
MOVQ ( REGIND (EDX), MM1 ) /* x1 | x0 */
- MOVQ ( MM0, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
PREFETCH ( REGIND(EDX) )
- MOVD ( MM1, REGOFF(-4, EAX) ) /* write r2 */
- MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */
-
- MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
DEC_L ( EBP ) /* decrement normal counter */
JA ( LLBL (G3TN_transform) )
POP_L ( EAX ) /* now normalizing ... */
POP_L ( EBP )
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
-
CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
JE ( LLBL (G3TN_norm ) ) /* calculate lengths */
PREFETCHW ( REGOFF(12,EAX) )
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
MOVD ( REGIND (EDI), MM3 ) /* | length (x) */
PFMUL ( MM3, MM1 ) /* | x2 (normalize*/
MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
DEC_L ( EBP ) /* decrement normal counter */
- MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
JA ( LLBL (G3TN_norm_w_lengths) )
JMP ( LLBL (G3TN_exit_3dnow) )
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND (EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
MOVQ ( MM0, MM3 ) /* x1 | x0 */
MOVQ ( MM1, MM4 ) /* | x2 */
PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PFMUL ( MM1, MM4 ) /* | x2*x2 */
PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalize*/
- MOVQ ( MM0, REGOFF(-12, EAX) ) /* write new x0, x1 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */
PFMUL ( MM5, MM1 ) /* | x2 (normalize*/
- MOVD ( MM1, REGOFF(-4, EAX) ) /* write new x2 */
- MOVQ ( REGIND (EAX), MM0 ) /* x1 | x0 */
-
- MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */
JA ( LLBL (G3TN_norm) )
LLBL (G3TN_exit_3dnow):
MOV_L ( ARG_LENGTHS, EDI )
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
- MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
JE ( LLBL (G3TNNR_end) )
ALIGNTEXT32
LLBL (G3TNNR_scale_end):
- MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
-
CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
JE ( LLBL (G3TNNR_norm) ) /* need to calculate lengths */
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+
PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
ADD_L ( STRIDE, EDX ) /* next normal */
PREFETCH ( REGIND(EDX) )
PFMUL ( MM2, MM7 ) /* | x2*m10 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PFMUL ( MM3, MM7 ) /* | x2 (normalized) */
PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
PFMUL ( MM3, MM6 ) /* x1 (normalized) | x0 (normalized) */
DEC_L ( EBP ) /* decrement normal counter */
- MOVQ ( MM6, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */
- MOVD ( MM7, REGOFF(-4, EAX) ) /* write r2 */
+ MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */
MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
- MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
-
JA ( LLBL (G3TNNR_norm_w_lengths) )
JMP ( LLBL (G3TNNR_exit_3dnow) )
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+
PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PFMUL ( MM2, MM7 ) /* | x2*m10 */
MOVQ ( MM6, MM3 ) /* x1 (transformed)| x0 (transformed) */
PFRCPIT2 ( MM4, MM5 )
PFMUL ( MM5, MM6 ) /* x1 (normalized) | x0 (normalized) */
- MOVQ ( MM6, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */
PFMUL ( MM5, MM7 ) /* | x2 (normalized) */
- MOVD ( MM7, REGOFF(-4, EAX) ) /* write r2 */
- MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
-
- MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+ MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */
JA ( LLBL (G3TNNR_norm) )
MOV_L ( ARG_IN, EAX )
MOV_L ( ARG_DEST, EDX )
- MOV_L ( REGOFF(V3F_COUNT, EAX), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V3F_COUNT, EDX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EDX) )
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_MAT, ECX )
MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
- MOV_L ( REGOFF(V3F_START, EDX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_START, EDX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
CMP_L ( CONST(0), EBP )
JE ( LLBL (G3TRNR_end) )
PFMUL ( MM6, MM0 ) /* scale*m5 | scale*m0 */
MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
- MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
PFMUL ( MM6, MM2 ) /* | scale*m10 */
- MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
-
ALIGNTEXT32
LLBL (G3TRNR_rescale):
PREFETCHW ( REGIND(EAX) )
-
+
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+
PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
ADD_L ( STRIDE, EDX ) /* next normal */
PREFETCH ( REGIND(EDX) )
PFMUL ( MM2, MM5 ) /* | x2*m10 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
DEC_L ( EBP ) /* decrement normal counter */
- MOVQ ( MM4, REGOFF(-12, EAX) ) /* write r0, r1 */
-
- MOVD ( MM5, REGOFF(-4, EAX) ) /* write r2 */
- MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */
- MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+ MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */
JA ( LLBL (G3TRNR_rescale) ) /* cnt > 0 ? -> process next normal */
FEMMS
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EDI ) /* dest->count = in->count */
- MOV_L ( EDI, REGOFF(V3F_COUNT, EAX) )
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
CMP_L ( CONST(0), EDI )
PFMUL ( MM0, MM5 ) /* scale*m6 | scale*m2 */
PFMUL ( MM0, MM6 ) /* scale*m9 | scale*m8 */
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
PFMUL ( MM0, MM7 ) /* | scale*m10 */
- MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
ALIGNTEXT32
LLBL (G3TR_rescale):
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
MOVQ ( MM0, MM1 ) /* x1 | x0 */
PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
PREFETCH ( REGIND(EDX) )
- MOVQ ( MM0, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
PFMUL ( MM7, MM2 ) /* | x2*m10 */
PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
- MOVD ( MM1, REGOFF(-4, EAX) ) /* write r2 */
-
- MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
DEC_L ( EDI ) /* decrement normal counter */
JA ( LLBL (G3TR_rescale) )
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EDI ) /* dest->count = in->count */
- MOV_L ( EDI, REGOFF(V3F_COUNT, EAX) )
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
CMP_L ( CONST(0), EDI )
MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
- MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
-
ALIGNTEXT32
LLBL (G3TNR_transform):
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+
PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
ADD_L ( STRIDE, EDX) /* next normal */
PREFETCH ( REGIND(EDX) )
PFMUL ( MM2, MM5 ) /* | x2*m10 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
DEC_L ( EDI ) /* decrement normal counter */
- MOVQ ( MM4, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */
- MOVD ( MM5, REGOFF(-4, EAX) ) /* write r2 */
- MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
-
- MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+ MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */
JA ( LLBL (G3TNR_transform) )
FEMMS
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
MOV_L ( ARG_MAT, ECX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EDI ) /* dest->count = in->count */
- MOV_L ( EDI, REGOFF(V3F_COUNT, EAX) )
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
CMP_L ( CONST(0), EDI ) /* count > 0 ?? */
MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8 */
MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
- MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
-
ALIGNTEXT32
LLBL (G3T_transform):
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
MOVQ ( MM0, MM1 ) /* x1 | x0 */
PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
- MOVQ ( MM0, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
- MOVD ( MM1, REGOFF(-4, EAX) ) /* write r2 */
- MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
-
- MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
DEC_L ( EDI ) /* decrement normal counter */
+
JA ( LLBL (G3T_transform) )
FEMMS
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EBP ) /* dest->count = in->count */
- MOV_L ( EBP, REGOFF(V3F_COUNT, EAX) )
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V3F_START, ESI), ECX ) /* in->start */
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */
MOV_L ( ARG_LENGTHS, EDX )
CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
FEMMS
- MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
-
CMP_L ( CONST(0), EDX ) /* lengths == 0 ? */
JE ( LLBL (G3N_norm2) ) /* calculate lengths */
PREFETCH ( REGIND(EAX) )
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+
MOVD ( REGIND(EDX), MM3 ) /* | length (x) */
PFMUL ( MM3, MM1 ) /* | x2 (normalized) */
MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
ADD_L ( CONST(4), EDX ) /* next length */
DEC_L ( EBP ) /* decrement normal counter */
- MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
JA ( LLBL (G3N_norm1) )
JMP ( LLBL (G3N_end1) )
PREFETCHW ( REGIND(EAX) )
+ PREFETCH ( REGIND(ECX) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+
MOVQ ( MM0, MM3 ) /* x1 | x0 */
ADD_L ( STRIDE, ECX ) /* next normal */
- PREFETCH ( REGIND(ECX) )
-
PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
MOVQ ( MM1, MM4 ) /* | x2 */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
PFMUL ( MM1, MM4 ) /* | x2*x2 */
PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
PFRCPIT2 ( MM4, MM5 )
PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalized) */
- MOVQ ( MM0, REGOFF(-12, EAX) ) /* write new x0, x1 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */
PFMUL ( MM5, MM1 ) /* | x2 (normalized) */
- MOVD ( MM1, REGOFF(-4, EAX) ) /* write new x2 */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */
- MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
JA ( LLBL (G3N_norm2) )
LLBL (G3N_end1):
MOV_L ( ARG_IN, ESI )
MOV_L ( ARG_DEST, EAX )
- MOV_L ( REGOFF(V3F_COUNT, ESI), EDX ) /* dest->count = in->count */
- MOV_L ( EDX, REGOFF(V3F_COUNT, EAX) )
- MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
- MOV_L ( REGOFF(V3F_START, ESI), ECX ) /* in->start */
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDX ) /* dest->count = in->count */
+ MOV_L ( EDX, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */
CMP_L ( CONST(0), EDX )
JE ( LLBL (G3R_end) )
MOVD ( ARG_SCALE, MM0 ) /* scale */
PUNPCKLDQ ( MM0, MM0 )
- MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
- MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
-
ALIGNTEXT32
LLBL (G3R_rescale):
PREFETCHW ( REGIND(EAX) )
+ MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
+
PFMUL ( MM0, MM1 ) /* x1*scale | x0*scale */
ADD_L ( STRIDE, ECX ) /* next normal */
PREFETCH ( REGIND(ECX) )
PFMUL ( MM0, MM2 ) /* | x2*scale */
- ADD_L ( CONST(12), EAX ) /* next r */
+ ADD_L ( CONST(16), EAX ) /* next r */
- MOVQ ( MM1, REGOFF(-12, EAX) ) /* write r0, r1 */
- MOVD ( MM2, REGOFF(-4, EAX) ) /* write r2 */
+ MOVQ ( MM1, REGOFF(-16, EAX) ) /* write r0, r1 */
+ MOVD ( MM2, REGOFF(-8, EAX) ) /* write r2 */
DEC_L ( EDX ) /* decrement normal counter */
- MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
-
- MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
JA ( LLBL (G3R_rescale) )
FEMMS