2 * Mesa 3-D graphics library
4 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
32 #define MATH_ASM_PTR_SIZE 8
33 #include "math/m_vector_asm.h"
/* Export _mesa_x86_64_cpuid as a global symbol with hidden visibility. */
38 .globl _mesa_x86_64_cpuid
39 .hidden _mesa_x86_64_cpuid
/*
 * _mesa_x86_64_transform_points4_general
 *
 * SSE transform of a 4-component float vertex by a full 16-float matrix:
 *   D = ox*(m0..m3) + oy*(m4..m7) + oz*(m8..m11) + ow*(m12..m15)
 *
 * Registers, as used by the code below:
 *   %rdi  dest vector descriptor (V4F_* fields), then the dest data pointer
 *   %rsi  matrix, read with movaps (16-byte alignment required)
 *   %rdx  source vector descriptor, then the source data pointer
 *   %ecx  vertex count        %eax  source stride
 *   %xmm4-%xmm7 hold the four matrix rows; %xmm0-%xmm3 and %xmm8 are scratch
 *
 * The source vertex is loaded with movups (no alignment requirement); the
 * result is stored with movaps, so the dest pointer must be 16-byte aligned.
 * Lane comments are written high | low.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
56 .globl _mesa_x86_64_transform_points4_general
57 .hidden _mesa_x86_64_transform_points4_general
58 _mesa_x86_64_transform_points4_general:
65 movl V4F_COUNT(%rdx), %ecx /* count */
66 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
68 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
69 movl $4, V4F_SIZE(%rdi) /* set dest size */
70 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 */
71 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
73 testl %ecx, %ecx /* verify non-zero count */
/* From here on %rdx/%rdi no longer point at the descriptors. */
77 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
78 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Load the four 16-byte matrix rows. */
82 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
83 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
84 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
85 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
86 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
/* Broadcast each source component, scale one matrix row by it, and sum. */
90 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
93 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
95 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
96 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
97 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
98 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
99 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
100 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
101 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
102 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
103 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
105 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
107 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * Constant data laid out in 16-byte rows.
 * NOTE(review): presumably the p4_constants table addressed by
 * _mesa_x86_64_transform_points4_3d - confirm against the label.
 *
 * Row 0: three all-ones dwords followed by a zero dword.  Used as an andps
 * mask it keeps lanes 0-2 and clears lane 3.
 */
121 .byte 0xff, 0xff, 0xff, 0xff
122 .byte 0xff, 0xff, 0xff, 0xff
123 .byte 0xff, 0xff, 0xff, 0xff
124 .byte 0x00, 0x00, 0x00, 0x00
/*
 * Row 1: three zero dwords in lanes 0-2.
 * NOTE(review): the lane-3 value of this row is not among these lines; the
 * 3d routine's comments imply it is 1.0f - verify.
 */
126 .byte 0x00, 0x00, 0x00, 0x00
127 .byte 0x00, 0x00, 0x00, 0x00
128 .byte 0x00, 0x00, 0x00, 0x00
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same SSE multiply-accumulate as the general transform, but the top lane of
 * every loaded matrix row is first forced to 0.0 (andps with %xmm9), and the
 * top lane of the last row is then set from %xmm10 (orps), which the
 * comments below give as 1.0.
 *
 * Registers, as used by the code below:
 *   %rdi  dest vector descriptor, then the dest data pointer (movaps store)
 *   %rsi  matrix, read with movaps (16-byte alignment required)
 *   %rdx  source vector descriptor, then the source data pointer
 *   %rax  address of p4_constants, later overwritten with the stride
 *   %ecx  vertex count
 *   %xmm4-%xmm7 masked matrix rows; %xmm0-%xmm3 and %xmm8 are scratch
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): %xmm9 must hold the lane mask before the andps sequence;
 * make sure it is loaded from p4_constants ahead of that point.
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
133 .globl _mesa_x86_64_transform_points4_3d
134 .hidden _mesa_x86_64_transform_points4_3d
136 * this is slower than _mesa_x86_64_transform_points4_general
137 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
139 _mesa_x86_64_transform_points4_3d:
141 leaq p4_constants(%rip), %rax /* RIP-relative address of the constant table */
146 movaps 16(%rax), %xmm10 /* second 16-byte row of the table */
148 movl V4F_COUNT(%rdx), %ecx /* count */
149 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
151 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
152 movl $4, V4F_SIZE(%rdi) /* set dest size */
153 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
155 testl %ecx, %ecx /* verify non-zero count */
/* From here on %rdx/%rdi no longer point at the descriptors. */
158 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
159 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Load the matrix rows and clear the top lane of each one. */
163 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
164 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
165 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
166 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
167 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
168 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
169 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
170 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
171 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
172 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
/* Broadcast each source component, scale one matrix row by it, and sum. */
176 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
179 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
181 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
182 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
183 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
184 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
185 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
186 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
187 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
188 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
189 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
191 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
193 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Setup for the identity case: copies the vertex count from the source
 * descriptor (%rdx) to the dest descriptor (%rdi), marks the dest as
 * size 4, and fetches both data pointers.
 *
 * Unlike the other routines in this file, the source data pointer is placed
 * in %rsi rather than %rdx; no matrix data is read through %rsi here.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
205 .globl _mesa_x86_64_transform_points4_identity
206 .hidden _mesa_x86_64_transform_points4_identity
207 _mesa_x86_64_transform_points4_identity:
209 movl V4F_COUNT(%rdx), %ecx /* count */
210 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
212 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
213 movl $4, V4F_SIZE(%rdi) /* set dest size */
214 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
219 movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
220 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * MMX/3DNow! transform using only the diagonal and the last row of the
 * matrix.  For a source vertex (x0, x1, x2, x3) the code below computes:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * Registers, as used by the code below:
 *   %rdi  dest vector descriptor, then the dest data pointer
 *   %rsi  matrix (floats at byte offsets 0, 20, 40, 48, 52 and 56 are read)
 *   %rdx  source vector descriptor, then the source data pointer
 *   %ecx  vertex count        %eax  source stride
 *   %mm0 = m11|m00, %mm1 = m31|m30, %mm2 = m32|m22; %mm4-%mm7 are scratch
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
234 .globl _mesa_3dnow_transform_points4_3d_no_rot
235 .hidden _mesa_3dnow_transform_points4_3d_no_rot
236 _mesa_3dnow_transform_points4_3d_no_rot:
238 movl V4F_COUNT(%rdx), %ecx /* count */
239 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
241 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
242 movl $4, V4F_SIZE(%rdi) /* set dest size */
243 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
244 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
247 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
/* From here on %rdx/%rdi no longer point at the descriptors. */
250 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
251 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements this variant needs. */
255 movd (%rsi), %mm0 /* | m00 */
256 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
257 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
259 movd 40(%rsi), %mm2 /* | m22 */
260 movq 48(%rsi), %mm1 /* m31 | m30 */
262 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
/* Per-vertex work: load x0..x3, scale, add the x3-weighted last row. */
268 movq (%rdx), %mm4 /* x1 | x0 */
269 movq 8(%rdx), %mm5 /* x3 | x2 */
270 movd 12(%rdx), %mm7 /* | x3 */
272 movq %mm5, %mm6 /* x3 | x2 */
273 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
275 punpckhdq %mm6, %mm6 /* x3 | x3 */
276 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
278 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
/* pfacc sums the two halves of each operand: low <- %mm5, high <- %mm7. */
279 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
281 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
284 movq %mm4, (%rdi) /* write r0, r1 */
285 movq %mm5, 8(%rdi) /* write r2, r3 */
291 jnz p4_3d_no_rot_loop
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * MMX/3DNow! transform for a perspective-shaped matrix.  For a source
 * vertex (x0, x1, x2, x3) the code below computes:
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * Registers, as used by the code below:
 *   %rdi  dest vector descriptor, then the dest data pointer
 *   %rsi  matrix (floats at byte offsets 0, 20, 32, 36, 40 and 56 are read)
 *   %rdx  source vector descriptor, then the source data pointer
 *   %ecx  vertex count        %eax  source stride
 *   %mm0 = m11|m00, %mm1 = m32|m22, %mm2 = m21|m20, %mm7 = 0
 *   %mm3-%mm6 are scratch
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
299 .globl _mesa_3dnow_transform_points4_perspective
300 .hidden _mesa_3dnow_transform_points4_perspective
301 _mesa_3dnow_transform_points4_perspective:
303 movl V4F_COUNT(%rdx), %ecx /* count */
304 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
306 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
307 movl $4, V4F_SIZE(%rdi) /* set dest size */
308 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
311 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
312 jz p4_perspective_done
/* From here on %rdx/%rdi no longer point at the descriptors. */
314 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
315 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements this variant needs; %mm7 stays zero. */
317 movd (%rsi), %mm0 /* | m00 */
318 pxor %mm7, %mm7 /* 0 | 0 */
319 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
321 movq 32(%rsi), %mm2 /* m21 | m20 */
324 movd 40(%rsi), %mm1 /* | m22 */
326 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
327 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
/* Per-vertex work. */
332 prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
334 movq (%rdx), %mm4 /* x1 | x0 */
335 movq 8(%rdx), %mm5 /* x3 | x2 */
336 movd 8(%rdx), %mm3 /* | x2 */
338 movq %mm5, %mm6 /* x3 | x2 */
339 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
341 punpckldq %mm5, %mm5 /* x2 | x2 */
343 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
/* pfsubr is a reversed subtract: %mm3 = %mm7 - %mm3 = 0 - x2. */
344 pfsubr %mm7, %mm3 /* | -x2 */
346 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
347 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
/* pfacc sums the two halves of each operand: low <- %mm6, high <- %mm3. */
349 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
351 movq %mm5, (%rdi) /* write r0, r1 */
353 movq %mm6, 8(%rdi) /* write r2, r3 */
358 prefetcht1 32(%rdx) /* hopefully stride is zero */
359 jnz p4_perspective_loop
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * MMX/3DNow! transform that scales and offsets only the first two
 * components.  For a source vertex (x0, x1, x2, x3) the code below computes:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2          (copied unchanged)
 *   r3 = x3          (copied unchanged)
 *
 * Registers, as used by the code below:
 *   %rdi  dest vector descriptor, then the dest data pointer
 *   %rsi  matrix (floats at byte offsets 0, 20, 48 and 52 are read)
 *   %rdx  source vector descriptor, then the source data pointer
 *   %ecx  vertex count        %eax  source stride
 *   %mm0 = m11|m00, %mm1 = m31|m30; %mm4-%mm6 are scratch
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
366 .globl _mesa_3dnow_transform_points4_2d_no_rot
367 .hidden _mesa_3dnow_transform_points4_2d_no_rot
368 _mesa_3dnow_transform_points4_2d_no_rot:
370 movl V4F_COUNT(%rdx), %ecx /* count */
371 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
373 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
374 movl $4, V4F_SIZE(%rdi) /* set dest size */
375 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
378 .byte 0x90 /* manual align += 1 */
/* From here on %rdx/%rdi no longer point at the descriptors. */
381 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
382 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements this variant needs. */
384 movd (%rsi), %mm0 /* | m00 */
386 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
388 movq 48(%rsi), %mm1 /* m31 | m30 */
/* Per-vertex work. */
392 prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
394 movq (%rdx), %mm4 /* x1 | x0 */
395 movq 8(%rdx), %mm5 /* x3 | x2 */
397 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
398 movq %mm5, %mm6 /* x3 | x2 */
400 punpckhdq %mm6, %mm6 /* x3 | x3 */
403 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
405 prefetcht1 32(%rdx) /* hopefully stride is zero */
406 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
408 movq %mm6, (%rdi) /* write r0, r1 */
409 movq %mm5, 8(%rdi) /* write r2, r3 */
414 jnz p4_2d_no_rot_loop
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * MMX/3DNow! transform applying a 2x2 block plus an x3-weighted offset to
 * the first two components.  For a source vertex (x0, x1, x2, x3) the code
 * below computes:
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2          (copied unchanged)
 *   r3 = x3          (copied unchanged)
 *
 * Registers, as used by the code below:
 *   %rdi  dest vector descriptor, then the dest data pointer
 *   %rsi  matrix (floats at byte offsets 0, 4, 16, 20, 48 and 52 are read)
 *   %rdx  source vector descriptor, then the source data pointer
 *   %ecx  vertex count        %eax  source stride
 *   %mm0 = m10|m00, %mm1 = m11|m01, %mm2 = m31|m30; %mm3-%mm6 are scratch
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated - confirm against the field's type.
 */
422 .globl _mesa_3dnow_transform_points4_2d
423 .hidden _mesa_3dnow_transform_points4_2d
424 _mesa_3dnow_transform_points4_2d:
426 movl V4F_COUNT(%rdx), %ecx /* count */
427 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte only, zero-extended) */
429 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
430 movl $4, V4F_SIZE(%rdi) /* set dest size */
431 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
432 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
435 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
/* From here on %rdx/%rdi no longer point at the descriptors. */
438 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
439 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements this variant needs. */
441 movd (%rsi), %mm0 /* | m00 */
442 movd 4(%rsi), %mm1 /* | m01 */
446 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
447 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
448 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
450 movq 48(%rsi), %mm2 /* m31 | m30 */
/* Per-vertex work. */
454 prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
456 movq (%rdx), %mm3 /* x1 | x0 */
457 movq 8(%rdx), %mm5 /* x3 | x2 */
459 movq %mm3, %mm4 /* x1 | x0 */
460 movq %mm5, %mm6 /* x3 | x2 */
462 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
463 punpckhdq %mm6, %mm6 /* x3 | x3 */
465 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
/* pfacc sums the two halves of each operand: low <- %mm3, high <- %mm4. */
468 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
470 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
471 prefetcht1 32(%rdx) /* hopefully stride is zero */
473 pfadd %mm6, %mm3 /* r1 | r0 */
475 movq %mm3, (%rdi) /* write r0, r1 */
476 movq %mm5, 8(%rdi) /* write r2, r3 */
/*
 * On ELF/Linux builds, emit an empty .note.GNU-stack section (no flags) so
 * that this object does not request an executable stack.
 */
489 #if defined (__ELF__) && defined (__linux__)
490 .section .note.GNU-stack,"",%progbits