Merge branch '965-glsl'
[mesa.git] / src / mesa / x86-64 / xform4.S
1 /*
2 * Mesa 3-D graphics library
3 * Version: 7.1
4 *
5 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #ifdef USE_X86_64_ASM
26
27 #include "matypes.h"
28
29 .text
30
31 .align 16
32
33 .globl _mesa_x86_64_transform_points4_general
34 _mesa_x86_64_transform_points4_general:
35 /*
36 * rdi = dest
37 * rsi = matrix
38 * rdx = source
39 */
40 movl V4F_COUNT(%rdx), %ecx /* count */
41 movzx V4F_STRIDE(%rdx), %eax /* stride */
42
43 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
44 movl $4, V4F_SIZE(%rdi) /* set dest size */
45 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 3 */
46 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
47
48 testl %ecx, %ecx /* verify non-zero count */
49 prefetchnta 64(%rsi)
50 jz p4_general_done
51
52 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
53 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
54
55 prefetch 16(%rdx)
56
57 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
58 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
59 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
60 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
61 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
62
63 p4_general_loop:
64
65 movups (%rdx), %xmm8 /* ox | oy | oz | ow */
66 prefetchw 16(%rdi)
67
68 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
69 addq %rax, %rdx
70 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
71 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
72 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | ox */
73 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
74 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
75 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
76 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
77 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
78 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
79 prefetch 16(%rdx)
80 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
81
82 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
83 addq $16, %rdi
84
85 decl %ecx
86 jnz p4_general_loop
87
88 p4_general_done:
89 .byte 0xf3
90 ret
91
92 .section .rodata
93
94 .align 16
95 p4_constants:
96 .byte 0xff, 0xff, 0xff, 0xff
97 .byte 0xff, 0xff, 0xff, 0xff
98 .byte 0xff, 0xff, 0xff, 0xff
99 .byte 0x00, 0x00, 0x00, 0x00
100
101 .byte 0x00, 0x00, 0x00, 0x00
102 .byte 0x00, 0x00, 0x00, 0x00
103 .byte 0x00, 0x00, 0x00, 0x00
104 .float 0f+1.0
105
106 .text
107 .align 16
108 .globl _mesa_x86_64_transform_points4_3d
109 /*
110 * this is slower than _mesa_x86_64_transform_points4_general
111 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
112 */
113 _mesa_x86_64_transform_points4_3d:
114
115 leaq p4_constants(%rip), %rax
116
117 prefetchnta 64(%rsi)
118
119 movaps (%rax), %xmm9
120 movaps 16(%rax), %xmm10
121
122 movl V4F_COUNT(%rdx), %ecx /* count */
123 movzx V4F_STRIDE(%rdx), %eax /* stride */
124
125 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
126 movl $4, V4F_SIZE(%rdi) /* set dest size */
127 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
128
129 testl %ecx, %ecx /* verify non-zero count */
130 jz p4_3d_done
131
132 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
133 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
134
135 prefetch 16(%rdx)
136
137 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
138 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
139 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
140 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
141 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
142 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
143 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
144 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
145 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
146 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
147
148 p4_3d_loop:
149
150 movups (%rdx), %xmm8 /* ox | oy | oz | ow */
151 prefetchw 16(%rdi)
152
153 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
154 addq %rax, %rdx
155 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
156 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
157 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | ox */
158 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
159 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
160 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
161 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
162 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
163 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
164 prefetch 16(%rdx)
165 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
166
167 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
168 addq $16, %rdi
169
170 dec %ecx
171 jnz p4_3d_loop
172
173 p4_3d_done:
174 .byte 0xf3
175 ret
176
177
178 .align 16
179 .globl _mesa_x86_64_transform_points4_identity
180 _mesa_x86_64_transform_points4_identity:
181
182 movl V4F_COUNT(%rdx), %ecx /* count */
183 movzx V4F_STRIDE(%rdx), %eax /* stride */
184
185 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
186 movl $4, V4F_SIZE(%rdi) /* set dest size */
187 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
188
189 test %ecx, %ecx
190 jz p4_identity_done
191
192 movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
193 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
194 prefetch 64(%rsi)
195 prefetchw 64(%rdi)
196
197 add %ecx, %ecx
198
199 rep movsq
200
201 p4_identity_done:
202 .byte 0xf3
203 ret
204
205
206 .align 16
207 .globl _mesa_x86_64_transform_points4_3d_no_rot
208 _mesa_x86_64_transform_points4_3d_no_rot:
209
210 movl V4F_COUNT(%rdx), %ecx /* count */
211 movzx V4F_STRIDE(%rdx), %eax /* stride */
212
213 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
214 movl $4, V4F_SIZE(%rdi) /* set dest size */
215 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
216 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
217
218 test %ecx, %ecx
219 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
220 jz p4_3d_no_rot_done
221
222 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
223 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
224
225 prefetch (%rdx)
226
227 movd (%rsi), %mm0 /* | m00 */
228 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
229 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
230
231 movd 40(%rsi), %mm2 /* | m22 */
232 movq 48(%rsi), %mm1 /* m31 | m30 */
233
234 punpckldq 56(%rsi), %mm2 /* m11 | m00 */
235
236 p4_3d_no_rot_loop:
237
238 prefetchw 32(%rdi)
239
240 movq (%rdx), %mm4 /* x1 | x0 */
241 movq 8(%rdx), %mm5 /* x3 | x2 */
242 movd 12(%rdx), %mm7 /* | x3 */
243
244 movq %mm5, %mm6 /* x3 | x2 */
245 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
246
247 punpckhdq %mm6, %mm6 /* x3 | x3 */
248 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
249
250 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
251 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
252
253 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
254
255 addq %rax, %rdx
256 movq %mm4, (%rdi) /* write r0, r1 */
257 movq %mm5, 8(%rdi) /* write r2, r3 */
258
259 addq $16, %rdi
260
261 decl %ecx
262 prefetch 32(%rdx)
263 jnz p4_3d_no_rot_loop
264
265 p4_3d_no_rot_done:
266 femms
267 ret
268
269
270 .align 16
271 .globl _mesa_x86_64_transform_points4_perspective
272 _mesa_x86_64_transform_points4_perspective:
273
274 movl V4F_COUNT(%rdx), %ecx /* count */
275 movzx V4F_STRIDE(%rdx), %eax /* stride */
276
277 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
278 movl $4, V4F_SIZE(%rdi) /* set dest size */
279 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
280
281 test %ecx, %ecx
282 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
283 jz p4_perspective_done
284
285 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
286 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
287
288 movd (%rsi), %mm0 /* | m00 */
289 pxor %mm7, %mm7 /* 0 | 0 */
290 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
291
292 movq 32(%rsi), %mm2 /* m21 | m20 */
293 prefetch (%rdx)
294
295 movd 40(%rsi), %mm1 /* | m22 */
296
297 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
298 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
299
300
301 p4_perspective_loop:
302
303 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
304
305 movq (%rdx), %mm4 /* x1 | x0 */
306 movq 8(%rdx), %mm5 /* x3 | x2 */
307 movd 8(%rdx), %mm3 /* | x2 */
308
309 movq %mm5, %mm6 /* x3 | x2 */
310 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
311
312 punpckldq %mm5, %mm5 /* x2 | x2 */
313
314 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
315 pfsubr %mm7, %mm3 /* | -x2 */
316
317 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
318 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
319
320 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
321
322 movq %mm5, (%rdi) /* write r0, r1 */
323 addq %rax, %rdx
324 movq %mm6, 8(%rdi) /* write r2, r3 */
325
326 addq $16, %rdi
327
328 decl %ecx
329 prefetch 32(%rdx) /* hopefully stride is zero */
330 jnz p4_perspective_loop
331
332 p4_perspective_done:
333 femms
334 ret
335
336 .align 16
337 .globl _mesa_x86_64_transform_points4_2d_no_rot
338 _mesa_x86_64_transform_points4_2d_no_rot:
339
340 movl V4F_COUNT(%rdx), %ecx /* count */
341 movzx V4F_STRIDE(%rdx), %eax /* stride */
342
343 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
344 movl $4, V4F_SIZE(%rdi) /* set dest size */
345 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
346
347 test %ecx, %ecx
348 .byte 0x90 /* manual align += 1 */
349 jz p4_2d_no_rot_done
350
351 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
352 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
353
354 movd (%rsi), %mm0 /* | m00 */
355 prefetch (%rdx)
356 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
357
358 movq 48(%rsi), %mm1 /* m31 | m30 */
359
360 p4_2d_no_rot_loop:
361
362 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
363
364 movq (%rdx), %mm4 /* x1 | x0 */
365 movq 8(%rdx), %mm5 /* x3 | x2 */
366
367 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
368 movq %mm5, %mm6 /* x3 | x2 */
369
370 punpckhdq %mm6, %mm6 /* x3 | x3 */
371
372 addq %rax, %rdx
373 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
374
375 prefetch 32(%rdx) /* hopefully stride is zero */
376 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
377
378 movq %mm6, (%rdi) /* write r0, r1 */
379 movq %mm5, 8(%rdi) /* write r2, r3 */
380
381 addq $16, %rdi
382
383 decl %ecx
384 jnz p4_2d_no_rot_loop
385
386 p4_2d_no_rot_done:
387 femms
388 ret
389
390
391 .align 16
392 .globl _mesa_x86_64_transform_points4_2d
393 _mesa_x86_64_transform_points4_2d:
394
395 movl V4F_COUNT(%rdx), %ecx /* count */
396 movzx V4F_STRIDE(%rdx), %eax /* stride */
397
398 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
399 movl $4, V4F_SIZE(%rdi) /* set dest size */
400 .byte 0x66, 0x66, 0x90 /* manual align += 4 */
401 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
402
403 test %ecx, %ecx
404 .byte 0x66, 0x66, 0x90 /* manual align += 4 */
405 jz p4_2d_done
406
407 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
408 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
409
410 movd (%rsi), %mm0 /* | m00 */
411 movd 4(%rsi), %mm1 /* | m01 */
412
413 prefetch (%rdx)
414
415 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
416 .byte 0x66, 0x66, 0x90 /* manual align += 4 */
417 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
418
419 movq 48(%rsi), %mm2 /* m31 | m30 */
420
421 p4_2d_loop:
422
423 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
424
425 movq (%rdx), %mm3 /* x1 | x0 */
426 movq 8(%rdx), %mm5 /* x3 | x2 */
427
428 movq %mm3, %mm4 /* x1 | x0 */
429 movq %mm5, %mm6 /* x3 | x2 */
430
431 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
432 punpckhdq %mm6, %mm6 /* x3 | x3 */
433
434 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
435
436 addq %rax, %rdx
437 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
438
439 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
440 prefetch 32(%rdx) /* hopefully stride is zero */
441
442 pfadd %mm6, %mm3 /* r1 | r0 */
443
444 movq %mm3, (%rdi) /* write r0, r1 */
445 movq %mm5, 8(%rdi) /* write r2, r3 */
446
447 addq $16, %rdi
448
449 decl %ecx
450 jnz p4_2d_loop
451
452 p4_2d_done:
453 femms
454 ret
455
456 #endif
457
458 #if defined (__ELF__) && defined (__linux__)
459 .section .note.GNU-stack,"",%progbits
460 #endif