Remove CVS keywords.
[mesa.git] / src / mesa / x86-64 / xform4.S
1
2 /*
3 * Mesa 3-D graphics library
4 * Version: 3.5
5 *
6 * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included
16 * in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #ifdef USE_X86_64_ASM
27
28 #include "matypes.h"
29
30 .text
31
32 .align 16
33
34 .globl _mesa_x86_64_transform_points4_general
35 _mesa_x86_64_transform_points4_general:
36 /*
37 * rdi = dest
38 * rsi = matrix
39 * rdx = source
40 */
41 movl V4F_COUNT(%rdx), %ecx /* count */
42 movzx V4F_STRIDE(%rdx), %eax /* stride */
43
44 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
45 movl $4, V4F_SIZE(%rdi) /* set dest size */
46 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 3 */
47 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
48
49 testl %ecx, %ecx /* verify non-zero count */
50 prefetchnta 64(%rsi)
51 jz p4_general_done
52
53 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
54 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
55
56 prefetch 16(%rdx)
57
58 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
59 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
60 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
61 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
62 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
63
64 p4_general_loop:
65
66 movaps (%rdx), %xmm8 /* ox | oy | oz | ow */
67 prefetchw 16(%rdi)
68
69 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
70 addq %rax, %rdx
71 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
72 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
73 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | ox */
74 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
75 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
76 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
77 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
78 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
79 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
80 prefetch 16(%rdx)
81 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
82
83 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
84 addq $16, %rdi
85
86 decl %ecx
87 jnz p4_general_loop
88
89 p4_general_done:
90 .byte 0xf3
91 ret
92
93 .section .rodata
94
95 .align 16
96 p4_constants:
97 .byte 0xff, 0xff, 0xff, 0xff
98 .byte 0xff, 0xff, 0xff, 0xff
99 .byte 0xff, 0xff, 0xff, 0xff
100 .byte 0x00, 0x00, 0x00, 0x00
101
102 .byte 0x00, 0x00, 0x00, 0x00
103 .byte 0x00, 0x00, 0x00, 0x00
104 .byte 0x00, 0x00, 0x00, 0x00
105 .float 0f+1.0
106
107 .text
108 .align 16
109 .globl _mesa_x86_64_transform_points4_3d
110 /*
111 * this is slower than _mesa_x86_64_transform_points4_general
112 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
113 */
114 _mesa_x86_64_transform_points4_3d:
115
116 leaq p4_constants(%rip), %rax
117
118 prefetchnta 64(%rsi)
119
120 movaps (%rax), %xmm9
121 movaps 16(%rax), %xmm10
122
123 movl V4F_COUNT(%rdx), %ecx /* count */
124 movzx V4F_STRIDE(%rdx), %eax /* stride */
125
126 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
127 movl $4, V4F_SIZE(%rdi) /* set dest size */
128 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
129
130 testl %ecx, %ecx /* verify non-zero count */
131 jz p4_3d_done
132
133 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
134 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
135
136 prefetch 16(%rdx)
137
138 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
139 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
140 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
141 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
142 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
143 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
144 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
145 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
146 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
147 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
148
149 p4_3d_loop:
150
151 movaps (%rdx), %xmm8 /* ox | oy | oz | ow */
152 prefetchw 16(%rdi)
153
154 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
155 addq %rax, %rdx
156 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
157 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
158 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | ox */
159 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
160 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
161 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
162 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
163 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
164 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
165 prefetch 16(%rdx)
166 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
167
168 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
169 addq $16, %rdi
170
171 dec %ecx
172 jnz p4_3d_loop
173
174 p4_3d_done:
175 .byte 0xf3
176 ret
177
178
179 .align 16
180 .globl _mesa_x86_64_transform_points4_identity
181 _mesa_x86_64_transform_points4_identity:
182
183 movl V4F_COUNT(%rdx), %ecx /* count */
184 movzx V4F_STRIDE(%rdx), %eax /* stride */
185
186 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
187 movl $4, V4F_SIZE(%rdi) /* set dest size */
188 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
189
190 test %ecx, %ecx
191 jz p4_identity_done
192
193 movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
194 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
195 prefetch 64(%rsi)
196 prefetchw 64(%rdi)
197
198 add %ecx, %ecx
199
200 rep movsq
201
202 p4_identity_done:
203 .byte 0xf3
204 ret
205
206
207 .align 16
208 .globl _mesa_x86_64_transform_points4_3d_no_rot
209 _mesa_x86_64_transform_points4_3d_no_rot:
210
211 movl V4F_COUNT(%rdx), %ecx /* count */
212 movzx V4F_STRIDE(%rdx), %eax /* stride */
213
214 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
215 movl $4, V4F_SIZE(%rdi) /* set dest size */
216 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
217 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
218
219 test %ecx, %ecx
220 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
221 jz p4_3d_no_rot_done
222
223 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
224 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
225
226 prefetch (%rdx)
227
228 movd (%rsi), %mm0 /* | m00 */
229 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
230 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
231
232 movd 40(%rsi), %mm2 /* | m22 */
233 movq 48(%rsi), %mm1 /* m31 | m30 */
234
235 punpckldq 56(%rsi), %mm2 /* m11 | m00 */
236
237 p4_3d_no_rot_loop:
238
239 prefetchw 32(%rdi)
240
241 movq (%rdx), %mm4 /* x1 | x0 */
242 movq 8(%rdx), %mm5 /* x3 | x2 */
243 movd 12(%rdx), %mm7 /* | x3 */
244
245 movq %mm5, %mm6 /* x3 | x2 */
246 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
247
248 punpckhdq %mm6, %mm6 /* x3 | x3 */
249 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
250
251 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
252 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
253
254 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
255
256 addq %rax, %rdx
257 movq %mm4, (%rdi) /* write r0, r1 */
258 movq %mm5, 8(%rdi) /* write r2, r3 */
259
260 addq $16, %rdi
261
262 decl %ecx
263 prefetch 32(%rdx)
264 jnz p4_3d_no_rot_loop
265
266 p4_3d_no_rot_done:
267 femms
268 ret
269
270
271 .align 16
272 .globl _mesa_x86_64_transform_points4_perspective
273 _mesa_x86_64_transform_points4_perspective:
274
275 movl V4F_COUNT(%rdx), %ecx /* count */
276 movzx V4F_STRIDE(%rdx), %eax /* stride */
277
278 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
279 movl $4, V4F_SIZE(%rdi) /* set dest size */
280 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
281
282 test %ecx, %ecx
283 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
284 jz p4_perspective_done
285
286 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
287 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
288
289 movd (%rsi), %mm0 /* | m00 */
290 pxor %mm7, %mm7 /* 0 | 0 */
291 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
292
293 movq 32(%rsi), %mm2 /* m21 | m20 */
294 prefetch (%rdx)
295
296 movd 40(%rsi), %mm1 /* | m22 */
297
298 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
299 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
300
301
302 p4_perspective_loop:
303
304 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
305
306 movq (%rdx), %mm4 /* x1 | x0 */
307 movq 8(%rdx), %mm5 /* x3 | x2 */
308 movd 8(%rdx), %mm3 /* | x2 */
309
310 movq %mm5, %mm6 /* x3 | x2 */
311 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
312
313 punpckldq %mm5, %mm5 /* x2 | x2 */
314
315 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
316 pfsubr %mm7, %mm3 /* | -x2 */
317
318 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
319 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
320
321 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
322
323 movq %mm5, (%rdi) /* write r0, r1 */
324 addq %rax, %rdx
325 movq %mm6, 8(%rdi) /* write r2, r3 */
326
327 addq $16, %rdi
328
329 decl %ecx
330 prefetch 32(%rdx) /* hopefully stride is zero */
331 jnz p4_perspective_loop
332
333 p4_perspective_done:
334 femms
335 ret
336
337 .align 16
338 .globl _mesa_x86_64_transform_points4_2d_no_rot
339 _mesa_x86_64_transform_points4_2d_no_rot:
340
341 movl V4F_COUNT(%rdx), %ecx /* count */
342 movzx V4F_STRIDE(%rdx), %eax /* stride */
343
344 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
345 movl $4, V4F_SIZE(%rdi) /* set dest size */
346 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
347
348 test %ecx, %ecx
349 .byte 0x90 /* manual align += 1 */
350 jz p4_2d_no_rot_done
351
352 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
353 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
354
355 movd (%rsi), %mm0 /* | m00 */
356 prefetch (%rdx)
357 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
358
359 movq 48(%rsi), %mm1 /* m31 | m30 */
360
361 p4_2d_no_rot_loop:
362
363 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
364
365 movq (%rdx), %mm4 /* x1 | x0 */
366 movq 8(%rdx), %mm5 /* x3 | x2 */
367
368 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
369 movq %mm5, %mm6 /* x3 | x2 */
370
371 punpckhdq %mm6, %mm6 /* x3 | x3 */
372
373 addq %rax, %rdx
374 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
375
376 prefetch 32(%rdx) /* hopefully stride is zero */
377 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
378
379 movq %mm6, (%rdi) /* write r0, r1 */
380 movq %mm5, 8(%rdi) /* write r2, r3 */
381
382 addq $16, %rdi
383
384 decl %ecx
385 jnz p4_2d_no_rot_loop
386
387 p4_2d_no_rot_done:
388 femms
389 ret
390
391
392 .align 16
393 .globl _mesa_x86_64_transform_points4_2d
394 _mesa_x86_64_transform_points4_2d:
395
396 movl V4F_COUNT(%rdx), %ecx /* count */
397 movzx V4F_STRIDE(%rdx), %eax /* stride */
398
399 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
400 movl $4, V4F_SIZE(%rdi) /* set dest size */
401 .byte 0x66, 0x66, 0x90 /* manual align += 4 */
402 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
403
404 test %ecx, %ecx
405 .byte 0x66, 0x66, 0x90 /* manual align += 4 */
406 jz p4_2d_done
407
408 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
409 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
410
411 movd (%rsi), %mm0 /* | m00 */
412 movd 4(%rsi), %mm1 /* | m01 */
413
414 prefetch (%rdx)
415
416 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
417 .byte 0x66, 0x66, 0x90 /* manual align += 4 */
418 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
419
420 movq 48(%rsi), %mm2 /* m31 | m30 */
421
422 p4_2d_loop:
423
424 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
425
426 movq (%rdx), %mm3 /* x1 | x0 */
427 movq 8(%rdx), %mm5 /* x3 | x2 */
428
429 movq %mm3, %mm4 /* x1 | x0 */
430 movq %mm5, %mm6 /* x3 | x2 */
431
432 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
433 punpckhdq %mm6, %mm6 /* x3 | x3 */
434
435 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
436
437 addq %rax, %rdx
438 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
439
440 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
441 prefetch 32(%rdx) /* hopefully stride is zero */
442
443 pfadd %mm6, %mm3 /* r1 | r0 */
444
445 movq %mm3, (%rdi) /* write r0, r1 */
446 movq %mm5, 8(%rdi) /* write r2, r3 */
447
448 addq $16, %rdi
449
450 decl %ecx
451 jnz p4_2d_loop
452
453 p4_2d_done:
454 femms
455 ret
456
457 #endif
458
459 #if defined (__ELF__) && defined (__linux__)
460 .section .note.GNU-stack,"",%progbits
461 #endif