3 * Mesa 3-D graphics library
6 * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included
16 * in all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
34 .globl _mesa_x86_64_transform_points4_general
35 _mesa_x86_64_transform_points4_general:
/*
 * Transform 4-component vertices by a general 4x4 matrix using SSE.
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor; its count, size and flags fields
 *         are written, then rdi is reloaded with the first dest vertex ptr
 *   rsi = matrix, 16 floats read as four 16-byte quads
 *   rdx = source vector descriptor; count and stride are read, then rdx is
 *         reloaded with the first source vertex ptr
 *   ecx = vertex count, eax = source stride
 *   xmm4..xmm7 = matrix quads m0-m3, m4-m7, m8-m11, m12-m15
 *
 * Per vertex, output lane i = ox*m[i] + oy*m[4+i] + oz*m[8+i] + ow*m[12+i].
 *
 * All 16-byte accesses use movaps, which requires the matrix, the source
 * vertex and the destination vertex to be 16-byte aligned.
 *
 * Lane comments list the highest lane first (lane3 | lane2 | lane1 | lane0).
 *
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): the zero-count branch, the loop label, the pointer advance
 * and the return are not present in the lines shown here - confirm against
 * the complete file.
 */
41 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
42 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride */
44 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
45 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
46 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 (four padding bytes: 0x66-prefixed nop) */
47 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
49 testl %ecx, %ecx /* sets ZF when count is zero */
53 movq V4F_START(%rdx), %rdx /* rdx = ptr to first src vertex (descriptor ptr is dropped) */
54 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
58 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
59 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
60 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
61 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
62 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
66 movaps (%rdx), %xmm8 /* ow | oz | oy | ox (source vertex) */
69 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox (broadcast lane 0) */
71 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy (broadcast lane 1) */
72 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
73 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz (broadcast lane 2) */
74 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
75 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow (broadcast lane 3) */
76 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
77 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
78 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
79 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
81 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
83 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * Constant data for the masked transform.
 *
 * The first 16 bytes (twelve 0xff followed by four 0x00) form a lane mask:
 * on little-endian x86 the three low 32-bit lanes are all ones and the top
 * lane is zero, so an andps with it keeps lanes 0-2 and clears lane 3.
 *
 * The bytes that follow are twelve zero bytes, i.e. three zero low lanes of
 * a second 16-byte row.
 *
 * NOTE(review): presumably this is the p4_constants table addressed by
 * _mesa_x86_64_transform_points4_3d; the label, the alignment directive and
 * the last 4 bytes of the second row are not visible in the lines shown
 * here - confirm against the complete file.
 */
97 .byte 0xff, 0xff, 0xff, 0xff
98 .byte 0xff, 0xff, 0xff, 0xff
99 .byte 0xff, 0xff, 0xff, 0xff
100 .byte 0x00, 0x00, 0x00, 0x00
102 .byte 0x00, 0x00, 0x00, 0x00
103 .byte 0x00, 0x00, 0x00, 0x00
104 .byte 0x00, 0x00, 0x00, 0x00
109 .globl _mesa_x86_64_transform_points4_3d
111 * this is slower than _mesa_x86_64_transform_points4_general
112 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
114 _mesa_x86_64_transform_points4_3d:
/*
 * Transform 4-component vertices by a 4x4 matrix whose top lane of every
 * 16-byte quad is forced to a constant before use (SSE).
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor, later the first dest vertex ptr
 *   rsi = matrix, 16 floats read as four 16-byte quads
 *   rdx = source vector descriptor, later the first source vertex ptr
 *   rax = address of p4_constants first, then reused as the source stride;
 *         the constant loads therefore have to happen before the movzx
 *   ecx = vertex count
 *   xmm4..xmm7 = matrix quads after masking, xmm10 = 16(p4_constants)
 *
 * Masking: andps with xmm9 clears lane 3 of each quad, then orps with xmm10
 * sets lane 3 of the last quad, so per the existing lane comments the
 * matrix is used as if m3 = m7 = m11 = 0 and m15 = 1.0.
 *
 * Lane comments list the highest lane first (lane3 | lane2 | lane1 | lane0).
 *
 * NOTE(review): the load of xmm9 is not visible in the lines shown here; it
 * is presumably the 16-byte mask at 0(p4_constants) - confirm.
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): the zero-count branch, the loop label, the pointer advance
 * and the return are not present in the lines shown here - confirm against
 * the complete file.
 */
116 leaq p4_constants(%rip), %rax /* rax = &p4_constants (RIP-relative) */
121 movaps 16(%rax), %xmm10 /* xmm10 = second 16-byte row of the constants */
123 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
124 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride (overwrites the constants ptr) */
126 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
127 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
128 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
130 testl %ecx, %ecx /* sets ZF when count is zero */
133 movq V4F_START(%rdx), %rdx /* rdx = ptr to first src vertex (descriptor ptr is dropped) */
134 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
138 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
139 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
140 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
141 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
142 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
143 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
144 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
145 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
146 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
147 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
151 movaps (%rdx), %xmm8 /* ow | oz | oy | ox (source vertex) */
154 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox (broadcast lane 0) */
156 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy (broadcast lane 1) */
157 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 (m3 masked to 0.0) */
158 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz (broadcast lane 2) */
159 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 (m7 masked to 0.0) */
160 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow (broadcast lane 3) */
161 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 (m11 masked to 0.0) */
162 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
163 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 (m15 replaced by 1.0) */
164 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
166 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
168 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
180 .globl _mesa_x86_64_transform_points4_identity
181 _mesa_x86_64_transform_points4_identity:
/*
 * Identity variant of the 4-component transform: the visible part copies
 * the source count to the destination descriptor, marks the destination as
 * a size-4 vector and fetches the two vertex pointers.
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor, later the first dest vertex ptr
 *   rdx = source vector descriptor
 *   rsi = first source vertex ptr; unlike the sibling routines the source
 *         pointer is placed in rsi rather than rdx, and nothing is read
 *         through the incoming rsi in the lines shown
 *   ecx = vertex count, eax = source stride
 *
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): the count test, the copy loop and the return are not
 * present in the lines shown here - confirm against the complete file.
 */
183 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
184 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride */
186 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
187 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
188 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
193 movq V4F_START(%rdx), %rsi /* rsi = ptr to first src vertex */
194 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
208 .globl _mesa_x86_64_transform_points4_3d_no_rot
209 _mesa_x86_64_transform_points4_3d_no_rot:
/*
 * Transform 4-component vertices by a matrix of which only the diagonal
 * scale terms and the last four-float group are read, using 3DNow!
 * instructions on the MMX registers.
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor, later the first dest vertex ptr
 *   rsi = matrix (floats at byte offsets 0, 20, 40, 48, 52 and 56 are read)
 *   rdx = source vector descriptor, later the first source vertex ptr
 *   ecx = vertex count, eax = source stride
 *   mm0 = m11 | m00,  mm1 = m31 | m30,  mm2 = m32 | m22
 *
 * Per vertex:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * Lane comments list the high 32-bit lane first (high | low).
 *
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): no femms/emms is visible in the lines shown here - confirm
 * the MMX state is cleared before the routine returns.
 * NOTE(review): the count test, the loop label, the pointer advance, the
 * count decrement and the return are not present in the lines shown here -
 * confirm against the complete file.
 */
211 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
212 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride */
214 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
215 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
216 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
217 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
220 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
223 movq V4F_START(%rdx), %rdx /* rdx = ptr to first src vertex (descriptor ptr is dropped) */
224 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
228 movd (%rsi), %mm0 /* | m00 */
229 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
230 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
232 movd 40(%rsi), %mm2 /* | m22 */
233 movq 48(%rsi), %mm1 /* m31 | m30 */
235 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
241 movq (%rdx), %mm4 /* x1 | x0 */
242 movq 8(%rdx), %mm5 /* x3 | x2 */
243 movd 12(%rdx), %mm7 /* 0 | x3 (movd clears the high lane) */
245 movq %mm5, %mm6 /* x3 | x2 */
246 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
248 punpckhdq %mm6, %mm6 /* x3 | x3 */
249 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
251 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
252 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 (high = x3 + 0 from mm7) */
254 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
257 movq %mm4, (%rdi) /* write r0, r1 */
258 movq %mm5, 8(%rdi) /* write r2, r3 */
264 jnz p4_3d_no_rot_loop /* loop while ZF is clear; NOTE(review): the flag-setting instruction is not visible here */
272 .globl _mesa_x86_64_transform_points4_perspective
273 _mesa_x86_64_transform_points4_perspective:
/*
 * Transform 4-component vertices by a perspective-shaped matrix, using
 * 3DNow! instructions on the MMX registers.
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor, later the first dest vertex ptr
 *   rsi = matrix (floats at byte offsets 0, 20, 32, 36, 40 and 56 are read)
 *   rdx = source vector descriptor, later the first source vertex ptr
 *   ecx = vertex count, eax = source stride
 *   mm0 = m11 | m00,  mm1 = m32 | m22,  mm2 = m21 | m20,  mm7 = 0 | 0
 *
 * Per vertex:
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * Lane comments list the high 32-bit lane first (high | low).
 *
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): no femms/emms is visible in the lines shown here - confirm
 * the MMX state is cleared before the routine returns.
 * NOTE(review): the loop label, the pointer advance, the count decrement,
 * the p4_perspective_done label and the return are not present in the lines
 * shown here - confirm against the complete file.
 */
275 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
276 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride */
278 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
279 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
280 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
283 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
284 jz p4_perspective_done /* skip the loop when ZF is set; NOTE(review): the count test that should set ZF is not visible between the orl and here - confirm */
286 movq V4F_START(%rdx), %rdx /* rdx = ptr to first src vertex (descriptor ptr is dropped) */
287 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
289 movd (%rsi), %mm0 /* | m00 */
290 pxor %mm7, %mm7 /* 0 | 0 (used to negate via pfsubr) */
291 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
293 movq 32(%rsi), %mm2 /* m21 | m20 */
296 movd 40(%rsi), %mm1 /* | m22 */
298 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
299 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
304 prefetchw 32(%rdi) /* prefetch 2 vertices ahead (dest vertices are 16 bytes) */
306 movq (%rdx), %mm4 /* x1 | x0 */
307 movq 8(%rdx), %mm5 /* x3 | x2 */
308 movd 8(%rdx), %mm3 /* 0 | x2 (movd clears the high lane) */
310 movq %mm5, %mm6 /* x3 | x2 */
311 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
313 punpckldq %mm5, %mm5 /* x2 | x2 */
315 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
316 pfsubr %mm7, %mm3 /* 0 | -x2 (mm3 = mm7 - mm3) */
318 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
319 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
321 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
323 movq %mm5, (%rdi) /* write r0, r1 */
325 movq %mm6, 8(%rdi) /* write r2, r3 */
330 prefetch 32(%rdx) /* hopefully stride is zero */
331 jnz p4_perspective_loop /* loop while ZF is clear; NOTE(review): the flag-setting instruction is not visible here */
338 .globl _mesa_x86_64_transform_points4_2d_no_rot
339 _mesa_x86_64_transform_points4_2d_no_rot:
/*
 * Transform 4-component vertices by a 2D scale-plus-translate matrix, using
 * 3DNow! instructions on the MMX registers.
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor, later the first dest vertex ptr
 *   rsi = matrix (floats at byte offsets 0, 20, 48 and 52 are read)
 *   rdx = source vector descriptor, later the first source vertex ptr
 *   ecx = vertex count, eax = source stride
 *   mm0 = m11 | m00,  mm1 = m31 | m30
 *
 * Per vertex:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2   (copied unchanged)
 *   r3 = x3   (copied unchanged)
 *
 * Lane comments list the high 32-bit lane first (high | low).
 *
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): no femms/emms is visible in the lines shown here - confirm
 * the MMX state is cleared before the routine returns.
 * NOTE(review): the count test, the loop label, the pointer advance, the
 * count decrement and the return are not present in the lines shown here -
 * confirm against the complete file.
 */
341 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
342 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride */
344 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
345 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
346 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
349 .byte 0x90 /* manual align += 1 (single-byte nop) */
352 movq V4F_START(%rdx), %rdx /* rdx = ptr to first src vertex (descriptor ptr is dropped) */
353 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
355 movd (%rsi), %mm0 /* | m00 */
357 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
359 movq 48(%rsi), %mm1 /* m31 | m30 */
363 prefetchw 32(%rdi) /* prefetch 2 vertices ahead (dest vertices are 16 bytes) */
365 movq (%rdx), %mm4 /* x1 | x0 */
366 movq 8(%rdx), %mm5 /* x3 | x2 (stored back unchanged as r3 | r2) */
368 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
369 movq %mm5, %mm6 /* x3 | x2 */
371 punpckhdq %mm6, %mm6 /* x3 | x3 */
374 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
376 prefetch 32(%rdx) /* hopefully stride is zero */
377 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
379 movq %mm6, (%rdi) /* write r0, r1 */
380 movq %mm5, 8(%rdi) /* write r2, r3 */
385 jnz p4_2d_no_rot_loop /* loop while ZF is clear; NOTE(review): the flag-setting instruction is not visible here */
393 .globl _mesa_x86_64_transform_points4_2d
394 _mesa_x86_64_transform_points4_2d:
/*
 * Transform 4-component vertices by a 2D matrix (2x2 linear part plus
 * translation), using 3DNow! instructions on the MMX registers.
 *
 * Register use, as read from the code below:
 *   rdi = destination vector descriptor, later the first dest vertex ptr
 *   rsi = matrix (floats at byte offsets 0, 4, 16, 20, 48 and 52 are read)
 *   rdx = source vector descriptor, later the first source vertex ptr
 *   ecx = vertex count, eax = source stride
 *   mm0 = m10 | m00,  mm1 = m11 | m01,  mm2 = m31 | m30
 *
 * Per vertex:
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2   (copied unchanged)
 *   r3 = x3   (copied unchanged)
 *
 * Lane comments list the high 32-bit lane first (high | low).
 *
 * NOTE(review): movzx below has no size suffix and a memory source, so the
 * width read from the stride field is the assembler default - confirm it
 * matches the declared width of that field.
 * NOTE(review): no femms/emms is visible in the lines shown here - confirm
 * the MMX state is cleared before the routine returns.
 * NOTE(review): the count test, the loop label, the pointer advance, the
 * loop branch and the return are not present in the lines shown here -
 * confirm against the complete file.
 */
396 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
397 movzx V4F_STRIDE(%rdx), %eax /* eax = source stride */
399 movl %ecx, V4F_COUNT(%rdi) /* dest count = source count */
400 movl $4, V4F_SIZE(%rdi) /* dest vertices always have 4 components */
401 .byte 0x66, 0x66, 0x90 /* manual align += 3 (three padding bytes) */
402 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* mark dest as a size-4 vector */
405 .byte 0x66, 0x66, 0x90 /* manual align += 3 (three padding bytes) */
408 movq V4F_START(%rdx), %rdx /* rdx = ptr to first src vertex (descriptor ptr is dropped) */
409 movq V4F_START(%rdi), %rdi /* rdi = ptr to first dest vertex (descriptor ptr is dropped) */
411 movd (%rsi), %mm0 /* | m00 */
412 movd 4(%rsi), %mm1 /* | m01 */
416 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
417 .byte 0x66, 0x66, 0x90 /* manual align += 3 (three padding bytes) */
418 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
420 movq 48(%rsi), %mm2 /* m31 | m30 */
424 prefetchw 32(%rdi) /* prefetch 2 vertices ahead (dest vertices are 16 bytes) */
426 movq (%rdx), %mm3 /* x1 | x0 */
427 movq 8(%rdx), %mm5 /* x3 | x2 (stored back unchanged as r3 | r2) */
429 movq %mm3, %mm4 /* x1 | x0 */
430 movq %mm5, %mm6 /* x3 | x2 */
432 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
433 punpckhdq %mm6, %mm6 /* x3 | x3 */
435 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
438 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
440 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
441 prefetch 32(%rdx) /* hopefully stride is zero */
443 pfadd %mm6, %mm3 /* r1 | r0 */
445 movq %mm3, (%rdi) /* write r0, r1 */
446 movq %mm5, 8(%rdi) /* write r2, r3 */
459 #if defined (__ELF__) && defined (__linux__)
460 .section .note.GNU-stack,"",%progbits