/*
 * Mesa 3-D graphics library
 * Version: 7.1
 *
 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

        .text

        .align 16
        .globl _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
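/*
 * rdi = pointer to four 32-bit words:
 *       [0] = eax (in/out), [1] = ebx (out), [2] = ecx (in/out), [3] = edx (out)
 */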
        pushq %rbx
        movl (%rdi), %eax
        movl 8(%rdi), %ecx

        cpuid

        movl %ebx, 4(%rdi)
        movl %eax, (%rdi)
        movl %ecx, 8(%rdi)
        movl %edx, 12(%rdi)
        popq %rbx
        ret

        .align 16
        .globl _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 * rdi = dest
 * rsi = matrix
 * rdx = source
 */
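/*
 * Illustrative C sketch (not part of the build) of what the SSE loop
 * below computes, with m = the 16 matrix floats (column-major) and the
 * source stride given in bytes:
 *
 *   for (i = 0; i < count; i++) {
 *      const GLfloat *s = (const GLfloat *) ((const char *) src + i*stride);
 *      GLfloat *d = dest + i*4;
 *      d[0] = m[0]*s[0] + m[4]*s[1] + m[8]*s[2]  + m[12]*s[3];
 *      d[1] = m[1]*s[0] + m[5]*s[1] + m[9]*s[2]  + m[13]*s[3];
 *      d[2] = m[2]*s[0] + m[6]*s[1] + m[10]*s[2] + m[14]*s[3];
 *      d[3] = m[3]*s[0] + m[7]*s[1] + m[11]*s[2] + m[15]*s[3];
 *   }
 */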
        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        testl %ecx, %ecx /* verify non-zero count */
        prefetchnta 64(%rsi)
        jz p4_general_done

        movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */

        prefetch 16(%rdx)

        movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
        movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
        movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */

p4_general_loop:

        movups (%rdx), %xmm8 /* ox | oy | oz | ow */
        prefetchw 16(%rdi)

        pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
        addq %rax, %rdx
        pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
        mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
        pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
        mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
        pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
        mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
        addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
        mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
        addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
        prefetch 16(%rdx)
        addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

        movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
        addq $16, %rdi

        decl %ecx
        jnz p4_general_loop

p4_general_done:
        .byte 0xf3 /* rep prefix: forms the two-byte "rep ret" (friendlier to AMD branch predictors) */
        ret

        .section .rodata

        .align 16
p4_constants:
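/*
 * first 16 bytes:  andps mask that clears the fourth float of a matrix column
 * second 16 bytes: 1.0 in the fourth float, or'ed in to force m15 = 1.0
 */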
        .byte 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff
        .byte 0x00, 0x00, 0x00, 0x00

        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .float 0f+1.0

        .text
        .align 16
        .globl _mesa_x86_64_transform_points4_3d
/*
 * this is slower than _mesa_x86_64_transform_points4_general
 * because it first forces the last row of the (column-major) matrix
 * to 0,0,0,1, i.e. m[3] = m[7] = m[11] = 0.0 and m[15] = 1.0
 */
_mesa_x86_64_transform_points4_3d:

        leaq p4_constants(%rip), %rax

        prefetchnta 64(%rsi)

        movaps (%rax), %xmm9 /* mask: clears the fourth float of a column */
        movaps 16(%rax), %xmm10 /* 1.0 in the fourth float */

        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        testl %ecx, %ecx /* verify non-zero count */
        jz p4_3d_done

        movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */

        prefetch 16(%rdx)

        movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
        movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
        andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
        movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
        andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
        movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
        andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
        andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

        movups (%rdx), %xmm8 /* ox | oy | oz | ow */
        prefetchw 16(%rdi)

        pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
        addq %rax, %rdx
        pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
        mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
        pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
        mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
        pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
        mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
        addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
        mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
        addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
        prefetch 16(%rdx)
        addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

        movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
        addq $16, %rdi

        dec %ecx
        jnz p4_3d_loop

p4_3d_done:
        .byte 0xf3 /* rep prefix: forms "rep ret" */
        ret


        .align 16
        .globl _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:
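/*
 * identity transform: copies count 16-byte vertices straight from src to dest
 */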

        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        test %ecx, %ecx
        jz p4_identity_done

        movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
        prefetch 64(%rsi)
        prefetchw 64(%rdi)

        add %ecx, %ecx /* 2 quadwords per vertex */

        rep movsq /* copy count*16 bytes */

p4_identity_done:
        .byte 0xf3 /* rep prefix: forms "rep ret" */
        ret


        .align 16
        .globl _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:
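/*
 * 3DNow! version for a matrix with no rotation terms; only m00, m11,
 * m22 and the translation row m30/m31/m32 are used:
 *   r0 = x*m00 + w*m30
 *   r1 = y*m11 + w*m31
 *   r2 = z*m22 + w*m32
 *   r3 = w
 */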

        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        test %ecx, %ecx
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        jz p4_3d_no_rot_done

        movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */

        prefetch (%rdx)

        movd (%rsi), %mm0 /* | m00 */
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        punpckldq 20(%rsi), %mm0 /* m11 | m00 */

        movd 40(%rsi), %mm2 /* | m22 */
        movq 48(%rsi), %mm1 /* m31 | m30 */

        punpckldq 56(%rsi), %mm2 /* m32 | m22 */

p4_3d_no_rot_loop:

        prefetchw 32(%rdi)

        movq (%rdx), %mm4 /* x1 | x0 */
        movq 8(%rdx), %mm5 /* x3 | x2 */
        movd 12(%rdx), %mm7 /* | x3 */

        movq %mm5, %mm6 /* x3 | x2 */
        pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */

        punpckhdq %mm6, %mm6 /* x3 | x3 */
        pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */

        pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
        pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */

        pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */

        addq %rax, %rdx
        movq %mm4, (%rdi) /* write r0, r1 */
        movq %mm5, 8(%rdi) /* write r2, r3 */

        addq $16, %rdi

        decl %ecx
        prefetch 32(%rdx)
        jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
        femms
        ret


        .align 16
        .globl _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:
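/*
 * 3DNow! version for a perspective-projection matrix; only m00, m11,
 * m20, m21, m22 and m32 are used:
 *   r0 = x*m00 + z*m20
 *   r1 = y*m11 + z*m21
 *   r2 = z*m22 + w*m32
 *   r3 = -z
 */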

        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        test %ecx, %ecx
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        jz p4_perspective_done

        movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */

        movd (%rsi), %mm0 /* | m00 */
        pxor %mm7, %mm7 /* 0 | 0 */
        punpckldq 20(%rsi), %mm0 /* m11 | m00 */

        movq 32(%rsi), %mm2 /* m21 | m20 */
        prefetch (%rdx)

        movd 40(%rsi), %mm1 /* | m22 */

        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        punpckldq 56(%rsi), %mm1 /* m32 | m22 */


p4_perspective_loop:

        prefetchw 32(%rdi) /* prefetch 2 vertices ahead */

        movq (%rdx), %mm4 /* x1 | x0 */
        movq 8(%rdx), %mm5 /* x3 | x2 */
        movd 8(%rdx), %mm3 /* | x2 */

        movq %mm5, %mm6 /* x3 | x2 */
        pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */

        punpckldq %mm5, %mm5 /* x2 | x2 */

        pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
        pfsubr %mm7, %mm3 /* | -x2 */

        pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
        pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */

        pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */

        movq %mm5, (%rdi) /* write r0, r1 */
        addq %rax, %rdx
        movq %mm6, 8(%rdi) /* write r2, r3 */

        addq $16, %rdi

        decl %ecx
        prefetch 32(%rdx) /* hopefully stride is zero */
        jnz p4_perspective_loop

p4_perspective_done:
        femms
        ret

        .align 16
        .globl _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:
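/*
 * 3DNow! version for a 2D matrix with no rotation; only m00, m11 and
 * the translation terms m30/m31 are used:
 *   r0 = x*m00 + w*m30
 *   r1 = y*m11 + w*m31
 *   r2 = z
 *   r3 = w
 */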

        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        test %ecx, %ecx
        .byte 0x90 /* manual align += 1 */
        jz p4_2d_no_rot_done

        movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */

        movd (%rsi), %mm0 /* | m00 */
        prefetch (%rdx)
        punpckldq 20(%rsi), %mm0 /* m11 | m00 */

        movq 48(%rsi), %mm1 /* m31 | m30 */

p4_2d_no_rot_loop:

        prefetchw 32(%rdi) /* prefetch 2 vertices ahead */

        movq (%rdx), %mm4 /* x1 | x0 */
        movq 8(%rdx), %mm5 /* x3 | x2 */

        pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
        movq %mm5, %mm6 /* x3 | x2 */

        punpckhdq %mm6, %mm6 /* x3 | x3 */

        addq %rax, %rdx
        pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */

        prefetch 32(%rdx) /* hopefully stride is zero */
        pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */

        movq %mm6, (%rdi) /* write r0, r1 */
        movq %mm5, 8(%rdi) /* write r2, r3 */

        addq $16, %rdi

        decl %ecx
        jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
        femms
        ret


        .align 16
        .globl _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:
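/*
 * 3DNow! version for a 2D matrix; only the upper-left 2x2 block
 * (m00, m01, m10, m11) and the translation terms m30/m31 are used:
 *   r0 = x*m00 + y*m10 + w*m30
 *   r1 = x*m01 + y*m11 + w*m31
 *   r2 = z
 *   r3 = w
 */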

        movl V4F_COUNT(%rdx), %ecx /* count */
        movzx V4F_STRIDE(%rdx), %eax /* stride */

        movl %ecx, V4F_COUNT(%rdi) /* set dest count */
        movl $4, V4F_SIZE(%rdi) /* set dest size */
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

        test %ecx, %ecx
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        jz p4_2d_done

        movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
        movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */

        movd (%rsi), %mm0 /* | m00 */
        movd 4(%rsi), %mm1 /* | m01 */

        prefetch (%rdx)

        punpckldq 16(%rsi), %mm0 /* m10 | m00 */
        .byte 0x66, 0x66, 0x90 /* manual align += 3 */
        punpckldq 20(%rsi), %mm1 /* m11 | m01 */

        movq 48(%rsi), %mm2 /* m31 | m30 */

p4_2d_loop:

        prefetchw 32(%rdi) /* prefetch 2 vertices ahead */

        movq (%rdx), %mm3 /* x1 | x0 */
        movq 8(%rdx), %mm5 /* x3 | x2 */

        movq %mm3, %mm4 /* x1 | x0 */
        movq %mm5, %mm6 /* x3 | x2 */

        pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
        punpckhdq %mm6, %mm6 /* x3 | x3 */

        pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */

        addq %rax, %rdx
        pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */

        pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
        prefetch 32(%rdx) /* hopefully stride is zero */

        pfadd %mm6, %mm3 /* r1 | r0 */

        movq %mm3, (%rdi) /* write r0, r1 */
        movq %mm5, 8(%rdi) /* write r2, r3 */

        addq $16, %rdi

        decl %ecx
        jnz p4_2d_loop

p4_2d_done:
        femms
        ret

#endif

#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif