1 /* $Id: xform4.S,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
4 * Mesa 3-D graphics library
7 * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
9 * Permission is hereby granted, free of charge, to any person obtaining a
10 * copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
23 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Multiplies each 4-component source vertex by a full 4x4 matrix with SSE.
 * Registers as used by the code below:
 *   %rdi  destination vector descriptor (count, size and flags are written,
 *         then replaced by the pointer to the first destination vertex)
 *   %rsi  matrix: 16 floats fetched with movaps, so 16-byte alignment is required
 *   %rdx  source vector descriptor (count, stride and start are read,
 *         then replaced by the pointer to the first source vertex)
 *   %ecx  vertex count, %eax stride
 * Lane comments list the highest element first (high | ... | low).
 * NOTE(review): the loop label, the branch after the count test, the pointer
 * advance and the return are not visible in this excerpt - confirm against
 * the full file.
 */
35 .globl _mesa_x86_64_transform_points4_general
36 _mesa_x86_64_transform_points4_general:
42 movl V4F_COUNT(%rdx), %ecx /* count */
43 movzx V4F_STRIDE(%rdx), %eax /* stride */
45 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
46 movl $4, V4F_SIZE(%rdi) /* set dest size */
47 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 (four padding bytes) */
48 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
50 testl %ecx, %ecx /* verify non-zero count (consumer of the flags not visible here) */
54 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
55 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* load the four 16-byte matrix vectors once; they stay in xmm4-xmm7 */
59 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
60 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
61 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
62 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
63 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
/* per vertex: broadcast each component, scale one matrix vector by it, sum */
67 movaps (%rdx), %xmm8 /* ow | oz | oy | ox */
70 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
72 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
73 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
74 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
75 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
76 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
77 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
78 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
79 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
80 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
82 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
84 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * NOTE(review): presumably the body of the p4_constants table that
 * _mesa_x86_64_transform_points4_3d addresses with leaq; the label itself is
 * not visible in this excerpt - confirm against the full file.
 */
/* bytes 0-15: all-ones in dwords 0-2, zero in dword 3 */
98 .byte 0xff, 0xff, 0xff, 0xff
99 .byte 0xff, 0xff, 0xff, 0xff
100 .byte 0xff, 0xff, 0xff, 0xff
101 .byte 0x00, 0x00, 0x00, 0x00
/*
 * bytes 16-27: three zero dwords. The fourth dword of this vector is not
 * visible here; the "1.0 | m14 | m13 | m12" comment on the orps in the 3d
 * routine suggests it holds 1.0 - TODO confirm.
 */
103 .byte 0x00, 0x00, 0x00, 0x00
104 .byte 0x00, 0x00, 0x00, 0x00
105 .byte 0x00, 0x00, 0x00, 0x00
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Runs the same broadcast/multiply/add sequence as
 * _mesa_x86_64_transform_points4_general, but first rewrites the matrix
 * copy held in registers so that m3, m7 and m11 are 0.0 and m15 is 1.0.
 * Registers as used by the code below:
 *   %rdi  destination vector descriptor, then first destination vertex
 *   %rsi  matrix (read with movaps, so 16-byte alignment is required)
 *   %rdx  source vector descriptor, then first source vertex
 *   %ecx  vertex count, %eax stride
 *   %xmm9 AND mask, %xmm10 value OR-ed into the last matrix vector
 * Lane comments list the highest element first (high | ... | low).
 * NOTE(review): the load of %xmm9, the loop label, the branch after the
 * count test and the return are not visible in this excerpt - confirm
 * against the full file.
 */
110 .globl _mesa_x86_64_transform_points4_3d
112 * this is slower than _mesa_x86_64_transform_points4_general
113 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
115 _mesa_x86_64_transform_points4_3d:
117 leaq p4_constants(%rip), %rax /* RIP-relative address of the constant table */
122 movaps 16(%rax), %xmm10 /* second 16 bytes of the table; OR-ed into xmm7 below */
124 movl V4F_COUNT(%rdx), %ecx /* count */
125 movzx V4F_STRIDE(%rdx), %eax /* stride (overwrites the table address in %rax) */
127 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
128 movl $4, V4F_SIZE(%rdi) /* set dest size */
129 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
131 testl %ecx, %ecx /* verify non-zero count (consumer of the flags not visible here) */
134 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
135 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* load the matrix, clearing the top lane of every vector as it arrives */
139 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
140 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
141 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
142 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
143 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
144 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
145 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
146 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
147 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
148 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
/* per vertex: broadcast each component, scale one matrix vector by it, sum */
152 movaps (%rdx), %xmm8 /* ow | oz | oy | ox */
155 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
157 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
158 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 (m3 lane forced to 0.0 above) */
159 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
160 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 (m7 lane forced to 0.0 above) */
161 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
162 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 (m11 lane forced to 0.0 above) */
163 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
164 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 (m15 lane forced to 1.0 above) */
165 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
167 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
169 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Only the setup is visible in this excerpt: it copies the vertex count to
 * the destination descriptor, marks the destination as size 4, and fetches
 * the source and destination vertex pointers.
 *   %rdi  destination vector descriptor, then first destination vertex
 *   %rdx  source vector descriptor
 *   %rsi  receives the first source vertex pointer (the incoming %rsi value
 *         is overwritten without being read)
 *   %ecx  vertex count, %eax stride
 * NOTE(review): the count test, the copy loop and the return are not
 * visible here - confirm against the full file.
 */
181 .globl _mesa_x86_64_transform_points4_identity
182 _mesa_x86_64_transform_points4_identity:
184 movl V4F_COUNT(%rdx), %ecx /* count */
185 movzx V4F_STRIDE(%rdx), %eax /* stride */
187 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
188 movl $4, V4F_SIZE(%rdi) /* set dest size */
189 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
194 movq V4F_START(%rdx), %rsi /* ptr to first src vertex (kept in %rsi, unlike the other routines) */
195 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/*
 * _mesa_x86_64_transform_points4_3d_no_rot
 *
 * MMX/3DNow! transform that reads only m00, m11, m22 and m30/m31/m32 from
 * the matrix. Per vertex, as computed below:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 * Registers as used by the code below:
 *   %rdi  destination vector descriptor, then first destination vertex
 *   %rsi  matrix, %rdx source vector descriptor, then first source vertex
 *   %ecx  vertex count, %eax stride
 *   %mm0 = m11 | m00, %mm1 = m31 | m30, %mm2 = m32 | m22 (loop invariants)
 * Lane comments are written high | low.
 * NOTE(review): the count test, the loop label, the pointer advance and
 * counter update that feed the final jnz, and the epilogue are not visible
 * in this excerpt - confirm against the full file.
 */
209 .globl _mesa_x86_64_transform_points4_3d_no_rot
210 _mesa_x86_64_transform_points4_3d_no_rot:
212 movl V4F_COUNT(%rdx), %ecx /* count */
213 movzx V4F_STRIDE(%rdx), %eax /* stride */
215 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
216 movl $4, V4F_SIZE(%rdi) /* set dest size */
217 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
218 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
221 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
224 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
225 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements this routine uses into mm0-mm2 */
229 movd (%rsi), %mm0 /* | m00 */
230 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
231 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
233 movd 40(%rsi), %mm2 /* | m22 */
234 movq 48(%rsi), %mm1 /* m31 | m30 */
236 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
/* per-vertex work */
242 movq (%rdx), %mm4 /* x1 | x0 */
243 movq 8(%rdx), %mm5 /* x3 | x2 */
244 movd 12(%rdx), %mm7 /* | x3 (high half zeroed by movd) */
246 movq %mm5, %mm6 /* x3 | x2 */
247 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
249 punpckhdq %mm6, %mm6 /* x3 | x3 */
250 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
252 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
253 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
255 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
258 movq %mm4, (%rdi) /* write r0, r1 */
259 movq %mm5, 8(%rdi) /* write r2, r3 */
265 jnz p4_3d_no_rot_loop /* loop while the (not visible) counter update leaves ZF clear */
/*
 * _mesa_x86_64_transform_points4_perspective
 *
 * MMX/3DNow! transform that reads only m00, m11, m20, m21, m22 and m32 from
 * the matrix. Per vertex, as computed below:
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 * Registers as used by the code below:
 *   %rdi  destination vector descriptor, then first destination vertex
 *   %rsi  matrix, %rdx source vector descriptor, then first source vertex
 *   %ecx  vertex count, %eax stride
 *   %mm0 = m11 | m00, %mm1 = m32 | m22, %mm2 = m21 | m20, %mm7 = 0 | 0
 * Lane comments are written high | low.
 * NOTE(review): the test that sets the flags for the jz, the loop label,
 * the pointer advance and counter update that feed the final jnz, and the
 * p4_perspective_done epilogue are not visible in this excerpt.
 */
273 .globl _mesa_x86_64_transform_points4_perspective
274 _mesa_x86_64_transform_points4_perspective:
276 movl V4F_COUNT(%rdx), %ecx /* count */
277 movzx V4F_STRIDE(%rdx), %eax /* stride */
279 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
280 movl $4, V4F_SIZE(%rdi) /* set dest size */
281 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
284 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
285 jz p4_perspective_done /* skip the loop; flag-setting test not visible here */
287 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
288 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements this routine uses into mm0-mm2 */
290 movd (%rsi), %mm0 /* | m00 */
291 pxor %mm7, %mm7 /* 0 | 0 */
292 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
294 movq 32(%rsi), %mm2 /* m21 | m20 */
297 movd 40(%rsi), %mm1 /* | m22 */
299 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
300 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
/* per-vertex work */
305 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
307 movq (%rdx), %mm4 /* x1 | x0 */
308 movq 8(%rdx), %mm5 /* x3 | x2 */
309 movd 8(%rdx), %mm3 /* | x2 */
311 movq %mm5, %mm6 /* x3 | x2 */
312 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
314 punpckldq %mm5, %mm5 /* x2 | x2 */
316 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
317 pfsubr %mm7, %mm3 /* | -x2 (reverse subtract: 0 - x2) */
319 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
320 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
322 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
324 movq %mm5, (%rdi) /* write r0, r1 */
326 movq %mm6, 8(%rdi) /* write r2, r3 */
331 prefetch 32(%rdx) /* hopefully stride is zero */
332 jnz p4_perspective_loop /* loop while the (not visible) counter update leaves ZF clear */
/*
 * _mesa_x86_64_transform_points4_2d_no_rot
 *
 * MMX/3DNow! transform that reads only m00, m11 and m30/m31 from the matrix.
 * Per vertex, as computed below:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2, r3 = x3 (stored unchanged)
 * Registers as used by the code below:
 *   %rdi  destination vector descriptor, then first destination vertex
 *   %rsi  matrix, %rdx source vector descriptor, then first source vertex
 *   %ecx  vertex count, %eax stride
 *   %mm0 = m11 | m00, %mm1 = m31 | m30 (loop invariants)
 * Lane comments are written high | low.
 * NOTE(review): the count test, the loop label, the pointer advance and
 * counter update that feed the final jnz, and the epilogue are not visible
 * in this excerpt - confirm against the full file.
 */
339 .globl _mesa_x86_64_transform_points4_2d_no_rot
340 _mesa_x86_64_transform_points4_2d_no_rot:
342 movl V4F_COUNT(%rdx), %ecx /* count */
343 movzx V4F_STRIDE(%rdx), %eax /* stride */
345 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
346 movl $4, V4F_SIZE(%rdi) /* set dest size */
347 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
350 .byte 0x90 /* manual align += 1 */
353 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
354 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements this routine uses into mm0-mm1 */
356 movd (%rsi), %mm0 /* | m00 */
358 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
360 movq 48(%rsi), %mm1 /* m31 | m30 */
/* per-vertex work */
364 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
366 movq (%rdx), %mm4 /* x1 | x0 */
367 movq 8(%rdx), %mm5 /* x3 | x2 */
369 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
370 movq %mm5, %mm6 /* x3 | x2 */
372 punpckhdq %mm6, %mm6 /* x3 | x3 */
375 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
377 prefetch 32(%rdx) /* hopefully stride is zero */
378 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
380 movq %mm6, (%rdi) /* write r0, r1 */
381 movq %mm5, 8(%rdi) /* write r2, r3 (x2, x3 copied through) */
386 jnz p4_2d_no_rot_loop /* loop while the (not visible) counter update leaves ZF clear */
/*
 * _mesa_x86_64_transform_points4_2d
 *
 * MMX/3DNow! transform that reads m00, m01, m10, m11 and m30/m31 from the
 * matrix. Per vertex, as computed below:
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2, r3 = x3 (stored unchanged)
 * Registers as used by the code below:
 *   %rdi  destination vector descriptor, then first destination vertex
 *   %rsi  matrix, %rdx source vector descriptor, then first source vertex
 *   %ecx  vertex count, %eax stride
 *   %mm0 = m10 | m00, %mm1 = m11 | m01, %mm2 = m31 | m30 (loop invariants)
 * Lane comments are written high | low.
 * NOTE(review): the count test, the loop label, the loop tail and the
 * epilogue are not visible in this excerpt - confirm against the full file.
 */
394 .globl _mesa_x86_64_transform_points4_2d
395 _mesa_x86_64_transform_points4_2d:
397 movl V4F_COUNT(%rdx), %ecx /* count */
398 movzx V4F_STRIDE(%rdx), %eax /* stride */
400 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
401 movl $4, V4F_SIZE(%rdi) /* set dest size */
402 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
403 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
406 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
409 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
410 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements this routine uses into mm0-mm2 */
412 movd (%rsi), %mm0 /* | m00 */
413 movd 4(%rsi), %mm1 /* | m01 */
417 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
418 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
419 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
421 movq 48(%rsi), %mm2 /* m31 | m30 */
/* per-vertex work */
425 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
427 movq (%rdx), %mm3 /* x1 | x0 */
428 movq 8(%rdx), %mm5 /* x3 | x2 */
430 movq %mm3, %mm4 /* x1 | x0 */
431 movq %mm5, %mm6 /* x3 | x2 */
433 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
434 punpckhdq %mm6, %mm6 /* x3 | x3 */
436 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
439 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
441 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
442 prefetch 32(%rdx) /* hopefully stride is zero */
444 pfadd %mm6, %mm3 /* r1 | r0 */
446 movq %mm3, (%rdi) /* write r0, r1 */
447 movq %mm5, 8(%rdi) /* write r2, r3 (x2, x3 copied through) */