2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Transforms 4-component points by a full 4x4 matrix using SSE:
 *   dest = ox*M[0..3] + oy*M[4..7] + oz*M[8..11] + ow*M[12..15]
 *
 * Register use visible in this routine (presumably the first three
 * integer arguments of the calling convention -- confirm with callers):
 *   %rdi = destination vector descriptor (count, size, flags are written;
 *          start pointer is read)
 *   %rsi = matrix, 16 floats, read with movaps => must be 16-byte aligned
 *   %rdx = source vector descriptor (count, stride, start are read)
 *   %ecx = vertex count, %eax = source stride
 *
 * Source vertices are read with movups (no alignment requirement);
 * results are stored with movaps, so destination vertices must be
 * 16-byte aligned.
 *
 * Lane comments are written high | ... | low, matching the matrix rows.
 *
 * NOTE(review): this listing has gaps -- the zero-count branch, the loop
 * label, the pointer/count updates and the return are not visible here.
 */
33 .globl _mesa_x86_64_transform_points4_general
34 _mesa_x86_64_transform_points4_general:
40 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 */
41 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
43 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
44 movl $4, V4F_SIZE(%rdi) /* set dest size */
45 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 (nop with three 0x66 prefixes) */
46 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
48 testl %ecx, %ecx /* verify non-zero count */
52 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
53 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Load the whole matrix once; %xmm4..%xmm7 stay constant per vertex. */
57 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
58 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
59 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
60 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
61 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
/* Per-vertex work: broadcast each source component and accumulate. */
65 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
68 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
70 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
71 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
72 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
73 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
74 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
75 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
76 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
77 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
78 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
80 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
82 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * Constant data rows.
 *
 * First 16 bytes: twelve 0xff bytes followed by four 0x00 bytes.  Used
 * as an andps mask this keeps the three low dwords of a vector and
 * clears the highest one.
 *
 * Following 12 bytes: three zero dwords.
 *
 * NOTE(review): the label for this table and the dword that completes
 * the second 16-byte row are not visible in this listing.  Presumably
 * this is p4_constants, referenced by _mesa_x86_64_transform_points4_3d,
 * and the missing dword is 1.0f -- confirm.
 */
96 .byte 0xff, 0xff, 0xff, 0xff
97 .byte 0xff, 0xff, 0xff, 0xff
98 .byte 0xff, 0xff, 0xff, 0xff
99 .byte 0x00, 0x00, 0x00, 0x00
101 .byte 0x00, 0x00, 0x00, 0x00
102 .byte 0x00, 0x00, 0x00, 0x00
103 .byte 0x00, 0x00, 0x00, 0x00
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same SSE multiply-accumulate as the general 4x4 routine, but the
 * matrix registers are first masked so that the highest lane of each
 * row is 0.0, and the highest lane of the last row is then OR-ed with
 * %xmm10 (commented below as yielding 1.0).
 *
 * Register use visible in this routine:
 *   %rdi = destination vector descriptor (count, size, flags written)
 *   %rsi = matrix, read with movaps => must be 16-byte aligned
 *   %rdx = source vector descriptor (count, stride, start read)
 *   %rax = address of p4_constants, then reused for the stride
 *   %xmm9 = AND mask, %xmm10 = value OR-ed into the last row
 *
 * Results are stored with movaps, so destination vertices must be
 * 16-byte aligned.  Lane comments are written high | ... | low.
 *
 * NOTE(review): no load of %xmm9 is visible in this listing; it is
 * presumably loaded from 0(%rax) on an elided line -- confirm.  The
 * zero-count branch, loop label, pointer/count updates and return are
 * likewise not visible here.
 */
108 .globl _mesa_x86_64_transform_points4_3d
110 * this is slower than _mesa_x86_64_transform_points4_general
111 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
113 _mesa_x86_64_transform_points4_3d:
115 leaq p4_constants(%rip), %rax
120 movaps 16(%rax), %xmm10
122 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 * This write also discards the p4_constants address held in %rax.
 */
123 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
125 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
126 movl $4, V4F_SIZE(%rdi) /* set dest size */
127 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
129 testl %ecx, %ecx /* verify non-zero count */
132 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
133 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Load the matrix rows, clearing the top lane of each as it arrives. */
137 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
138 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
139 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
140 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
141 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
142 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
143 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
144 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
145 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
146 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
/* Per-vertex work: broadcast each source component and accumulate. */
150 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
153 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
155 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
156 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
157 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
158 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
159 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
160 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
161 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
162 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
163 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
165 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
167 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Visible part: copies the vertex count from the source descriptor to
 * the destination descriptor, marks the destination as 4-component, and
 * fetches the first-vertex pointers.
 *
 * Register use visible in this routine:
 *   %rdi = destination vector descriptor, then ptr to first dest vertex
 *   %rdx = source vector descriptor
 *   %rsi = ptr to first source vertex (overwrites the incoming value;
 *          no matrix read is visible in this routine)
 *   %ecx = vertex count, %eax = source stride
 *
 * NOTE(review): everything after the pointer loads (the copy itself,
 * any zero-count check, and the return) is not visible in this listing.
 */
179 .globl _mesa_x86_64_transform_points4_identity
180 _mesa_x86_64_transform_points4_identity:
182 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 */
183 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
185 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
186 movl $4, V4F_SIZE(%rdi) /* set dest size */
187 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
192 movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
193 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/*
 * _mesa_x86_64_transform_points4_3d_no_rot
 *
 * Scale-and-translate transform using 3DNow! (pfmul/pfacc/pfadd) on MMX
 * registers.  Only the diagonal m00, m11, m22 and the last row
 * m30, m31, m32 of the matrix are read.  Per vertex (x0,x1,x2,x3):
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * Register use visible in this routine:
 *   %rdi = destination vector descriptor, then ptr to dest vertex
 *   %rsi = matrix (floats at byte offsets 0, 20, 40, 48, 52, 56 are read)
 *   %rdx = source vector descriptor, then ptr to source vertex
 *   %ecx = vertex count, %eax = source stride
 *   %mm0 = m11 | m00, %mm1 = m31 | m30, %mm2 = m32 | m22
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): this listing has gaps -- the zero-count test/branch, the
 * p4_3d_no_rot_loop label, the pointer/count updates that set the flags
 * for the final jnz, any femms/emms, and the return are not visible.
 */
207 .globl _mesa_x86_64_transform_points4_3d_no_rot
208 _mesa_x86_64_transform_points4_3d_no_rot:
210 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 */
211 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
213 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
214 movl $4, V4F_SIZE(%rdi) /* set dest size */
215 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
216 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
219 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
222 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
223 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements that this transform uses. */
227 movd (%rsi), %mm0 /* | m00 */
228 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
229 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
231 movd 40(%rsi), %mm2 /* | m22 */
232 movq 48(%rsi), %mm1 /* m31 | m30 */
234 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
/* Per-vertex work. */
240 movq (%rdx), %mm4 /* x1 | x0 */
241 movq 8(%rdx), %mm5 /* x3 | x2 */
242 movd 12(%rdx), %mm7 /* | x3 */
244 movq %mm5, %mm6 /* x3 | x2 */
245 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
247 punpckhdq %mm6, %mm6 /* x3 | x3 */
248 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
250 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
/* pfacc sums each operand's two lanes: low = dest pair, high = src pair. */
251 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
253 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
256 movq %mm4, (%rdi) /* write r0, r1 */
257 movq %mm5, 8(%rdi) /* write r2, r3 */
263 jnz p4_3d_no_rot_loop
/*
 * _mesa_x86_64_transform_points4_perspective
 *
 * Perspective-matrix transform using 3DNow! (pfmul/pfacc/pfadd/pfsubr)
 * on MMX registers.  Only m00, m11, m20, m21, m22 and m32 of the matrix
 * are read.  Per vertex (x0,x1,x2,x3):
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * Register use visible in this routine:
 *   %rdi = destination vector descriptor, then ptr to dest vertex
 *   %rsi = matrix (floats at byte offsets 0, 20, 32, 36, 40, 56 are read)
 *   %rdx = source vector descriptor, then ptr to source vertex
 *   %ecx = vertex count, %eax = source stride
 *   %mm0 = m11 | m00, %mm1 = m32 | m22, %mm2 = m21 | m20, %mm7 = 0 | 0
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): this listing has gaps -- the count test that should set
 * the flags for "jz p4_perspective_done", both labels, the pointer/count
 * updates that set the flags for the final jnz, any femms/emms, and the
 * return are not visible.
 */
271 .globl _mesa_x86_64_transform_points4_perspective
272 _mesa_x86_64_transform_points4_perspective:
274 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 */
275 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
277 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
278 movl $4, V4F_SIZE(%rdi) /* set dest size */
279 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
282 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
283 jz p4_perspective_done
285 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
286 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements that this transform uses. */
288 movd (%rsi), %mm0 /* | m00 */
289 pxor %mm7, %mm7 /* 0 | 0 */
290 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
292 movq 32(%rsi), %mm2 /* m21 | m20 */
295 movd 40(%rsi), %mm1 /* | m22 */
297 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
298 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
/* Per-vertex work. */
303 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
305 movq (%rdx), %mm4 /* x1 | x0 */
306 movq 8(%rdx), %mm5 /* x3 | x2 */
307 movd 8(%rdx), %mm3 /* | x2 */
309 movq %mm5, %mm6 /* x3 | x2 */
310 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
312 punpckldq %mm5, %mm5 /* x2 | x2 */
314 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
/* pfsubr computes source - destination, i.e. 0 - x2 here. */
315 pfsubr %mm7, %mm3 /* | -x2 */
317 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
318 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
/* pfacc sums each operand's two lanes: low = dest pair, high = src pair. */
320 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
322 movq %mm5, (%rdi) /* write r0, r1 */
324 movq %mm6, 8(%rdi) /* write r2, r3 */
329 prefetch 32(%rdx) /* hopefully stride is zero */
330 jnz p4_perspective_loop
/*
 * _mesa_x86_64_transform_points4_2d_no_rot
 *
 * 2D scale-and-translate transform using 3DNow! (pfmul/pfadd) on MMX
 * registers.  Only m00, m11, m30 and m31 of the matrix are read.
 * Per vertex (x0,x1,x2,x3):
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2   (copied unchanged)
 *   r3 = x3   (copied unchanged)
 *
 * Register use visible in this routine:
 *   %rdi = destination vector descriptor, then ptr to dest vertex
 *   %rsi = matrix (floats at byte offsets 0, 20, 48, 52 are read)
 *   %rdx = source vector descriptor, then ptr to source vertex
 *   %ecx = vertex count, %eax = source stride
 *   %mm0 = m11 | m00, %mm1 = m31 | m30
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): this listing has gaps -- the zero-count test/branch, the
 * p4_2d_no_rot_loop label, the pointer/count updates that set the flags
 * for the final jnz, any femms/emms, and the return are not visible.
 */
337 .globl _mesa_x86_64_transform_points4_2d_no_rot
338 _mesa_x86_64_transform_points4_2d_no_rot:
340 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 */
341 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
343 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
344 movl $4, V4F_SIZE(%rdi) /* set dest size */
345 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
348 .byte 0x90 /* manual align += 1 */
351 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
352 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements that this transform uses. */
354 movd (%rsi), %mm0 /* | m00 */
356 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
358 movq 48(%rsi), %mm1 /* m31 | m30 */
/* Per-vertex work. */
362 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
364 movq (%rdx), %mm4 /* x1 | x0 */
365 movq 8(%rdx), %mm5 /* x3 | x2 */
367 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
368 movq %mm5, %mm6 /* x3 | x2 */
370 punpckhdq %mm6, %mm6 /* x3 | x3 */
373 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
375 prefetch 32(%rdx) /* hopefully stride is zero */
376 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
378 movq %mm6, (%rdi) /* write r0, r1 */
/* %mm5 still holds the untouched source x3 | x2. */
379 movq %mm5, 8(%rdi) /* write r2, r3 */
384 jnz p4_2d_no_rot_loop
/*
 * _mesa_x86_64_transform_points4_2d
 *
 * 2D transform (2x2 block plus translation) using 3DNow!
 * (pfmul/pfacc/pfadd) on MMX registers.  Only m00, m01, m10, m11, m30
 * and m31 of the matrix are read.  Per vertex (x0,x1,x2,x3):
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2   (copied unchanged)
 *   r3 = x3   (copied unchanged)
 *
 * Register use visible in this routine:
 *   %rdi = destination vector descriptor, then ptr to dest vertex
 *   %rsi = matrix (floats at byte offsets 0, 4, 16, 20, 48, 52 are read)
 *   %rdx = source vector descriptor, then ptr to source vertex
 *   %ecx = vertex count, %eax = source stride
 *   %mm0 = m10 | m00, %mm1 = m11 | m01, %mm2 = m31 | m30
 *
 * Lane comments are written high | low.
 *
 * NOTE(review): this listing has gaps -- the zero-count test/branch, the
 * loop label and back-branch, the pointer/count updates, any
 * femms/emms, and the return are not visible.
 */
392 .globl _mesa_x86_64_transform_points4_2d
393 _mesa_x86_64_transform_points4_2d:
395 movl V4F_COUNT(%rdx), %ecx /* count */
/*
 * Explicit b->l suffixes: an unsuffixed movzx with a memory source does
 * not say whether the source is a byte or a word, and some assemblers
 * reject it.  A byte source matches what GNU as has traditionally
 * assumed for the unsuffixed form -- NOTE(review): confirm.
 */
396 movzbl V4F_STRIDE(%rdx), %eax /* stride (low byte, zero-extended) */
398 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
399 movl $4, V4F_SIZE(%rdi) /* set dest size */
400 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
401 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
404 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
407 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
408 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements that this transform uses. */
410 movd (%rsi), %mm0 /* | m00 */
411 movd 4(%rsi), %mm1 /* | m01 */
415 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
416 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
417 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
419 movq 48(%rsi), %mm2 /* m31 | m30 */
/* Per-vertex work. */
423 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
425 movq (%rdx), %mm3 /* x1 | x0 */
426 movq 8(%rdx), %mm5 /* x3 | x2 */
428 movq %mm3, %mm4 /* x1 | x0 */
429 movq %mm5, %mm6 /* x3 | x2 */
431 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
432 punpckhdq %mm6, %mm6 /* x3 | x3 */
434 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
/* pfacc sums each operand's two lanes: low = dest pair, high = src pair. */
437 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
439 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
440 prefetch 32(%rdx) /* hopefully stride is zero */
442 pfadd %mm6, %mm3 /* r1 | r0 */
444 movq %mm3, (%rdi) /* write r0, r1 */
/* %mm5 still holds the untouched source x3 | x2. */
445 movq %mm5, 8(%rdi) /* write r2, r3 */
/*
 * On Linux/ELF targets, emit an empty .note.GNU-stack section (no flags,
 * %progbits).  By toolchain convention this tells the linker that the
 * object does not need an executable stack.
 * NOTE(review): the matching #endif is not visible in this listing.
 */
458 #if defined (__ELF__) && defined (__linux__)
459 .section .note.GNU-stack,"",%progbits