2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 .globl _mesa_x86_64_cpuid
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Multiplies 4-component vertices by a full 4x4 matrix using SSE.
 *
 * Register roles, as used by the code below:
 *   %rdi  destination vector descriptor (V4F_COUNT, V4F_SIZE, V4F_FLAGS and
 *         V4F_START are accessed); later reused as the dest vertex pointer
 *   %rsi  matrix, 16 floats m0..m15, loaded with movaps (so it has to be
 *         16-byte aligned)
 *   %rdx  source vector descriptor (V4F_COUNT, V4F_STRIDE and V4F_START are
 *         read); later reused as the src vertex pointer
 *   %ecx  vertex count, %eax stride
 *
 * Per vertex (ox, oy, oz, ow) the stored result is
 *   ox*(m0..m3) + oy*(m4..m7) + oz*(m8..m11) + ow*(m12..m15)
 *
 * Lane comments list the highest lane first and the lowest lane last.
 *
 * The branch consuming the count test, the loop label, the pointer
 * advance / loop branch and the return are not visible in this excerpt.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand, so the width read from V4F_STRIDE is not explicit
 * here -- confirm the intended width.
 */
48 .globl _mesa_x86_64_transform_points4_general
49 _mesa_x86_64_transform_points4_general:
55 movl V4F_COUNT(%rdx), %ecx /* count */
56 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
58 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
59 movl $4, V4F_SIZE(%rdi) /* set dest size */
60 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 (four padding bytes: prefixed NOP) */
61 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
63 testl %ecx, %ecx /* verify non-zero count */
/* from here on %rdx / %rdi hold vertex data pointers, not descriptors */
67 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
68 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* keep the whole matrix in xmm4..xmm7, one 16-byte group per register */
72 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
73 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
74 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
75 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
76 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
/* per-vertex work: unaligned load of the source, aligned store to the dest */
80 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
/* broadcast each source component and scale one matrix group with it */
83 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
85 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
86 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
87 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
88 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
89 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
90 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
/* accumulate the four scaled groups into xmm0 */
91 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
92 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
93 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
95 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
97 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * Constant data for the SSE code. The label for this table is not visible
 * in this excerpt; presumably it is the p4_constants symbol that
 * _mesa_x86_64_transform_points4_3d addresses -- confirm.
 *
 * First 16-byte group: three all-ones dwords followed by one zero dword,
 * i.e. an AND mask that keeps the three lowest lanes and clears the top one.
 */
111 .byte 0xff, 0xff, 0xff, 0xff
112 .byte 0xff, 0xff, 0xff, 0xff
113 .byte 0xff, 0xff, 0xff, 0xff
114 .byte 0x00, 0x00, 0x00, 0x00
/*
 * Second group: three zero dwords are visible; the fourth dword is not part
 * of this excerpt (presumably the 1.0 that the 3d routine ORs into the top
 * lane -- confirm).
 */
116 .byte 0x00, 0x00, 0x00, 0x00
117 .byte 0x00, 0x00, 0x00, 0x00
118 .byte 0x00, 0x00, 0x00, 0x00
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same per-vertex arithmetic as a full 4x4 SSE transform, but the matrix is
 * patched in registers first: every 16-byte group is ANDed with %xmm9 and
 * the last group is additionally ORed with %xmm10. According to the lane
 * comments this zeroes m3, m7 and m11 and forces m15 to 1.0.
 *
 * Register roles, as used by the code below:
 *   %rdi  destination vector descriptor, later the dest vertex pointer
 *   %rsi  matrix, 16 floats, loaded with movaps (16-byte aligned)
 *   %rdx  source vector descriptor, later the src vertex pointer
 *   %rax  address of p4_constants first, then overwritten with the stride
 *   %ecx  vertex count
 *
 * %xmm10 is loaded from p4_constants+16. The load of %xmm9 is not visible
 * in this excerpt (presumably the mask at p4_constants+0 -- confirm).
 * The branch consuming the count test, the loop label, the pointer
 * advance / loop branch and the return are not visible either.
 *
 * Lane comments list the highest lane first and the lowest lane last.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand -- confirm the intended width of the stride read.
 */
123 .globl _mesa_x86_64_transform_points4_3d
125 * this is slower than _mesa_x86_64_transform_points4_general
126 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
128 _mesa_x86_64_transform_points4_3d:
/* constants must be fetched before %eax is reused for the stride */
130 leaq p4_constants(%rip), %rax
135 movaps 16(%rax), %xmm10
137 movl V4F_COUNT(%rdx), %ecx /* count */
138 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
140 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
141 movl $4, V4F_SIZE(%rdi) /* set dest size */
142 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
144 testl %ecx, %ecx /* verify non-zero count */
/* from here on %rdx / %rdi hold vertex data pointers, not descriptors */
147 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
148 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* load the matrix and clear the top lane of every group via the mask */
152 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
153 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
154 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
155 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
156 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
157 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
158 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
159 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
160 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
161 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
/* per-vertex work: unaligned load of the source, aligned store to the dest */
165 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
/* broadcast each source component and scale one (patched) group with it */
168 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
170 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
171 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 (m3 lane was cleared above) */
172 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
173 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 (m7 lane was cleared above) */
174 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
175 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 (m11 lane was cleared above) */
/* accumulate the four scaled groups into xmm0 */
176 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
177 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 (m15 lane was replaced above) */
178 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
180 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
182 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Setup for the identity case: reads count and stride from the source
 * descriptor in %rdx, marks the destination descriptor in %rdi as a
 * 4-component vector with the same count, then fetches both data pointers.
 *
 * Unlike the sibling routines, the source vertex pointer is placed in %rsi
 * (overwriting whatever %rsi held on entry) rather than in %rdx.
 *
 * The count test, the per-vertex loop and the return are not visible in
 * this excerpt.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand -- confirm the intended width of the stride read.
 */
194 .globl _mesa_x86_64_transform_points4_identity
195 _mesa_x86_64_transform_points4_identity:
197 movl V4F_COUNT(%rdx), %ecx /* count */
198 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
200 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
201 movl $4, V4F_SIZE(%rdi) /* set dest size */
202 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
/* from here on %rsi / %rdi hold vertex data pointers */
207 movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
208 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * 4-component transform using MMX registers and 3DNow! float instructions
 * (pfmul / pfacc / pfadd). Only five matrix elements are read: m00, m11,
 * m22 and the pair m30/m31 plus m32 (mRC is the float at byte offset
 * 16*R + 4*C from %rsi).
 *
 * Per vertex (x0, x1, x2, x3) the stored result is
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * Register roles, as used by the code below:
 *   %rdi  destination vector descriptor, later the dest vertex pointer
 *   %rsi  matrix
 *   %rdx  source vector descriptor, later the src vertex pointer
 *   %ecx  vertex count, %eax stride
 *
 * Lane comments are written "high dword | low dword".
 *
 * The count test and its branch, the loop label, the pointer advance, the
 * instruction that sets the flags for the final jnz, and the return are not
 * visible in this excerpt.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand -- confirm the intended width of the stride read.
 * NOTE(review): no femms/emms is visible in this excerpt -- confirm the MMX
 * state is released before returning.
 */
222 .globl _mesa_3dnow_transform_points4_3d_no_rot
223 _mesa_3dnow_transform_points4_3d_no_rot:
225 movl V4F_COUNT(%rdx), %ecx /* count */
226 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
228 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
229 movl $4, V4F_SIZE(%rdi) /* set dest size */
230 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
231 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
234 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
/* from here on %rdx / %rdi hold vertex data pointers, not descriptors */
237 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
238 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements that are used into mm0, mm1 and mm2 */
242 movd (%rsi), %mm0 /* | m00 */
243 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
244 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
246 movd 40(%rsi), %mm2 /* | m22 */
247 movq 48(%rsi), %mm1 /* m31 | m30 */
249 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
/* per-vertex work */
255 movq (%rdx), %mm4 /* x1 | x0 */
256 movq 8(%rdx), %mm5 /* x3 | x2 */
257 movd 12(%rdx), %mm7 /* | x3 (movd clears the high dword) */
259 movq %mm5, %mm6 /* x3 | x2 */
260 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
262 punpckhdq %mm6, %mm6 /* x3 | x3 */
263 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
265 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
/* pfacc sums the two halves of each operand: low = mm5 halves, high = mm7 halves */
266 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
268 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
271 movq %mm4, (%rdi) /* write r0, r1 */
272 movq %mm5, 8(%rdi) /* write r2, r3 */
/* loop back; the flag-setting instruction and the target label are not visible here */
278 jnz p4_3d_no_rot_loop
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * 4-component transform using MMX registers and 3DNow! float instructions.
 * The matrix elements read are m00, m11, m20, m21, m22 and m32 (mRC is the
 * float at byte offset 16*R + 4*C from %rsi).
 *
 * Per vertex (x0, x1, x2, x3) the stored result is
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * Register roles, as used by the code below:
 *   %rdi  destination vector descriptor, later the dest vertex pointer
 *   %rsi  matrix
 *   %rdx  source vector descriptor, later the src vertex pointer
 *   %ecx  vertex count, %eax stride
 *   %mm7  kept at zero; used to negate x2 via pfsubr
 *
 * Lane comments are written "high dword | low dword".
 *
 * The instruction that sets the flags for "jz p4_perspective_done", the
 * loop label, the pointer advance, the instruction that sets the flags for
 * the final jnz, and the return are not visible in this excerpt.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand -- confirm the intended width of the stride read.
 * NOTE(review): no femms/emms is visible in this excerpt -- confirm the MMX
 * state is released before returning.
 */
286 .globl _mesa_3dnow_transform_points4_perspective
287 _mesa_3dnow_transform_points4_perspective:
289 movl V4F_COUNT(%rdx), %ecx /* count */
290 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
292 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
293 movl $4, V4F_SIZE(%rdi) /* set dest size */
294 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
297 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
/* early exit; the flags come from an instruction not visible in this excerpt */
298 jz p4_perspective_done
/* from here on %rdx / %rdi hold vertex data pointers, not descriptors */
300 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
301 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements that are used into mm0, mm1 and mm2 */
303 movd (%rsi), %mm0 /* | m00 */
304 pxor %mm7, %mm7 /* 0 | 0 */
305 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
307 movq 32(%rsi), %mm2 /* m21 | m20 */
310 movd 40(%rsi), %mm1 /* | m22 */
312 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
313 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
/* per-vertex work */
318 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
320 movq (%rdx), %mm4 /* x1 | x0 */
321 movq 8(%rdx), %mm5 /* x3 | x2 */
322 movd 8(%rdx), %mm3 /* | x2 (movd clears the high dword) */
324 movq %mm5, %mm6 /* x3 | x2 */
325 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
327 punpckldq %mm5, %mm5 /* x2 | x2 */
329 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
/* reverse subtract: mm3 = mm7 - mm3 = 0 - x2 */
330 pfsubr %mm7, %mm3 /* | -x2 */
332 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
333 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
/* pfacc sums the two halves of each operand: low = mm6 halves, high = mm3 halves */
335 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
337 movq %mm5, (%rdi) /* write r0, r1 */
339 movq %mm6, 8(%rdi) /* write r2, r3 */
344 prefetch 32(%rdx) /* hopefully stride is zero */
/* loop back; the flag-setting instruction and the target label are not visible here */
345 jnz p4_perspective_loop
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * 4-component transform using MMX registers and 3DNow! float instructions.
 * The matrix elements read are m00, m11, m30 and m31 (mRC is the float at
 * byte offset 16*R + 4*C from %rsi).
 *
 * Per vertex (x0, x1, x2, x3) the stored result is
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2            (copied unchanged)
 *   r3 = x3            (copied unchanged)
 *
 * Register roles, as used by the code below:
 *   %rdi  destination vector descriptor, later the dest vertex pointer
 *   %rsi  matrix
 *   %rdx  source vector descriptor, later the src vertex pointer
 *   %ecx  vertex count, %eax stride
 *
 * Lane comments are written "high dword | low dword".
 *
 * The count test and its branch, the loop label, the pointer advance, the
 * instruction that sets the flags for the final jnz, and the return are not
 * visible in this excerpt.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand -- confirm the intended width of the stride read.
 * NOTE(review): no femms/emms is visible in this excerpt -- confirm the MMX
 * state is released before returning.
 */
352 .globl _mesa_3dnow_transform_points4_2d_no_rot
353 _mesa_3dnow_transform_points4_2d_no_rot:
355 movl V4F_COUNT(%rdx), %ecx /* count */
356 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
358 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
359 movl $4, V4F_SIZE(%rdi) /* set dest size */
360 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
363 .byte 0x90 /* manual align += 1 */
/* from here on %rdx / %rdi hold vertex data pointers, not descriptors */
366 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
367 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements that are used into mm0 and mm1 */
369 movd (%rsi), %mm0 /* | m00 */
371 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
373 movq 48(%rsi), %mm1 /* m31 | m30 */
/* per-vertex work */
377 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
379 movq (%rdx), %mm4 /* x1 | x0 */
380 movq 8(%rdx), %mm5 /* x3 | x2 */
382 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
383 movq %mm5, %mm6 /* x3 | x2 */
385 punpckhdq %mm6, %mm6 /* x3 | x3 */
388 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
390 prefetch 32(%rdx) /* hopefully stride is zero */
391 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
393 movq %mm6, (%rdi) /* write r0, r1 */
/* mm5 still holds the untouched source pair, so x2 and x3 pass straight through */
394 movq %mm5, 8(%rdi) /* write r2, r3 */
/* loop back; the flag-setting instruction and the target label are not visible here */
399 jnz p4_2d_no_rot_loop
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * 4-component transform using MMX registers and 3DNow! float instructions.
 * The matrix elements read are m00, m01, m10, m11, m30 and m31 (mRC is the
 * float at byte offset 16*R + 4*C from %rsi).
 *
 * Per vertex (x0, x1, x2, x3) the stored result is
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2            (copied unchanged)
 *   r3 = x3            (copied unchanged)
 *
 * Register roles, as used by the code below:
 *   %rdi  destination vector descriptor, later the dest vertex pointer
 *   %rsi  matrix
 *   %rdx  source vector descriptor, later the src vertex pointer
 *   %ecx  vertex count, %eax stride
 *
 * Lane comments are written "high dword | low dword".
 *
 * The count test and its branch, the loop label, the pointer advance, the
 * loop branch and the return are not visible in this excerpt.
 *
 * NOTE(review): movzx is written without a size suffix although its source
 * is a memory operand -- confirm the intended width of the stride read.
 * NOTE(review): no femms/emms is visible in this excerpt -- confirm the MMX
 * state is released before returning.
 */
407 .globl _mesa_3dnow_transform_points4_2d
408 _mesa_3dnow_transform_points4_2d:
410 movl V4F_COUNT(%rdx), %ecx /* count */
411 movzx V4F_STRIDE(%rdx), %eax /* stride */
/* the destination always ends up described as a 4-component vector */
413 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
414 movl $4, V4F_SIZE(%rdi) /* set dest size */
415 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
416 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
419 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
/* from here on %rdx / %rdi hold vertex data pointers, not descriptors */
422 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
423 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* gather the matrix elements that are used into mm0, mm1 and mm2 */
425 movd (%rsi), %mm0 /* | m00 */
426 movd 4(%rsi), %mm1 /* | m01 */
430 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
431 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
432 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
434 movq 48(%rsi), %mm2 /* m31 | m30 */
/* per-vertex work */
438 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
440 movq (%rdx), %mm3 /* x1 | x0 */
441 movq 8(%rdx), %mm5 /* x3 | x2 */
443 movq %mm3, %mm4 /* x1 | x0 */
444 movq %mm5, %mm6 /* x3 | x2 */
446 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
447 punpckhdq %mm6, %mm6 /* x3 | x3 */
449 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
/* pfacc sums the two halves of each operand: low = mm3 halves, high = mm4 halves */
452 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
454 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
455 prefetch 32(%rdx) /* hopefully stride is zero */
457 pfadd %mm6, %mm3 /* r1 | r0 */
459 movq %mm3, (%rdi) /* write r0, r1 */
/* mm5 still holds the untouched source pair, so x2 and x3 pass straight through */
460 movq %mm5, 8(%rdi) /* write r2, r3 */
473 #if defined (__ELF__) && defined (__linux__)
474 .section .note.GNU-stack,"",%progbits