fix GL_DOT3_RGBA texture combiner mode in generated fragment programs (bug #11030)
[mesa.git] / src / mesa / x86-64 / xform4.S
1 /* $Id: xform4.S,v 1.2 2006/04/17 18:58:24 krh Exp $ */
2
3 /*
4 * Mesa 3-D graphics library
5 * Version: 3.5
6 *
7 * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a
10 * copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
23 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #ifdef USE_X86_64_ASM
28
29 #include "matypes.h"
30
31 .text
32
.align 16

/*
 * _mesa_x86_64_transform_points4_general
 *
 * Transform an array of 4-component vertices by a full 4x4 matrix.
 * For every source vertex (ox, oy, oz, ow) and i = 0..3 the result is
 *     D(i) = ox*m[i] + oy*m[4+i] + oz*m[8+i] + ow*m[12+i]
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix, 16 floats m0..m15
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags.  This happens
 * even when count is zero.
 *
 * The matrix, every source vertex and every dest vertex are accessed
 * with movaps and must therefore be 16-byte aligned.  Dest vertices are
 * written 16 bytes apart; source vertices are "stride" bytes apart.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 *
 * Lane comments list the highest lane first.  The .byte sequences are
 * prefixed NOPs used as hand-placed padding; keep the instruction order
 * and encodings unchanged or the padding has to be re-tuned.
 */
.globl _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	.byte	0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	prefetchnta 64(%rsi)			/* does not touch the flags set by testl */
	jz	p4_general_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movaps	(%rdx), %xmm8			/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* ox | ox | ox | ox */
	addq	%rax, %rdx			/* advance to next src vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2		/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3		/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0			/* ox*m3+oy*m7 | ... */
	mulps	%xmm7, %xmm3			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0			/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps	%xmm0, (%rdi)			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	jnz	p4_general_loop

p4_general_done:
	.byte	0xf3				/* REP prefix: together with ret forms "rep ret" */
	ret
93
.section .rodata

.align 16
/*
 * Two 16-byte constants consumed by _mesa_x86_64_transform_points4_3d:
 *   offset  0: AND mask that keeps lanes 0-2 and clears lane 3
 *   offset 16: OR value with lanes 0-2 zero and lane 3 = 1.0f
 */
p4_constants:
	.long	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000
	.long	0x00000000, 0x00000000, 0x00000000, 0x3f800000	/* lane 3 = 1.0f */
107
.text
.align 16

/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same computation as _mesa_x86_64_transform_points4_general, except
 * that the matrix is first patched in registers: m3, m7 and m11 are
 * forced to 0.0 and m15 is forced to 1.0 (using the two masks stored at
 * p4_constants).  The extra masking is the setup cost over the general
 * routine.  With that matrix D(3) = ox*0 + oy*0 + oz*0 + ow*1.
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix, 16 floats m0..m15 (memory is not modified)
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags, also when count
 * is zero.
 *
 * The matrix, the constants, every source vertex and every dest vertex
 * are accessed with movaps and must be 16-byte aligned.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 * Lane comments list the highest lane first.
 */
.globl _mesa_x86_64_transform_points4_3d
_mesa_x86_64_transform_points4_3d:

	leaq	p4_constants(%rip), %rax	/* rax is reused for the stride below */

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9			/* 0 | ~0 | ~0 | ~0  (AND mask) */
	movaps	16(%rax), %xmm10		/* 1.0 | 0 | 0 | 0   (OR value) */

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	jz	p4_3d_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	andps	%xmm9, %xmm4			/* 0.0 | m2  | m1  | m0  */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	andps	%xmm9, %xmm5			/* 0.0 | m6  | m5  | m4  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */
	andps	%xmm9, %xmm6			/* 0.0 | m10 | m9  | m8  */
	andps	%xmm9, %xmm7			/* 0.0 | m14 | m13 | m12 */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	orps	%xmm10, %xmm7			/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movaps	(%rdx), %xmm8			/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* ox | ox | ox | ox */
	addq	%rax, %rdx			/* advance to next src vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0			/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2		/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1			/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3		/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2			/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0			/* sum of the ox and oy terms */
	mulps	%xmm7, %xmm3			/* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0			/* ... plus the oz terms */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* ... plus the ow terms */

	movaps	%xmm0, (%rdi)			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	jnz	p4_3d_loop

p4_3d_done:
	.byte	0xf3				/* REP prefix: together with ret forms "rep ret" */
	ret
178
179
.align 16

/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copy every 4-component (16 byte) source vertex unchanged to the dest
 * array.  The matrix argument is not read.
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix (ignored; the register is reused as the source pointer)
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags, also when count
 * is zero.
 *
 * Source vertices are "stride" bytes apart, dest vertices 16 bytes
 * apart.  When the source is tightly packed (stride == 16) the whole
 * array is moved with one rep movsq; any other stride is copied vertex
 * by vertex so that the stride is honoured, as in the other transform
 * routines of this file.  (The previous code loaded the stride but
 * always copied count*16 contiguous bytes.)
 *
 * rep movsq copies upward only when the direction flag is clear; that is
 * assumed to hold on entry.
 *
 * Clobbers: rax, rcx, rsi, rdi, r8, r9, flags.
 */
.globl _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* nothing to copy? */
	jz	p4_identity_done

	movq	V4F_START(%rdx), %rsi		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl	$16, %eax			/* tightly packed source? */
	jne	p4_identity_loop

	addl	%ecx, %ecx			/* two quadwords per vertex */
	rep movsq				/* copy rcx quadwords from (rsi) to (rdi) */
	ret

p4_identity_loop:
	movq	(%rsi), %r8			/* first two components */
	movq	8(%rsi), %r9			/* last two components */
	addq	%rax, %rsi			/* advance to next src vertex */
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	jnz	p4_identity_loop

p4_identity_done:
	.byte	0xf3				/* REP prefix: together with ret forms "rep ret" */
	ret
206
207
.align 16

/*
 * _mesa_x86_64_transform_points4_3d_no_rot
 *
 * Transform 4-component vertices using only five matrix entries, read
 * from byte offsets 0 (m00), 20 (m11), 40 (m22), 48/52 (m30/m31) and
 * 56 (m32) of the matrix.  For a source vertex (x0, x1, x2, x3):
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2*m22 + x3*m32
 *     r3 = x3
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags, also when count
 * is zero.
 *
 * Uses MMX registers with 3DNow! arithmetic (pfmul/pfadd/pfacc) and
 * leaves MMX state with femms before returning, on both exit paths.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm7 (x87/MMX state), flags.
 * Lane comments list the high dword first.  The .byte sequences are
 * prefixed NOPs used as hand-placed padding; keep the instruction order
 * and encodings unchanged or the padding has to be re-tuned.
 */
.globl _mesa_x86_64_transform_points4_3d_no_rot
_mesa_x86_64_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	jz	p4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	prefetch (%rdx)

	movd	(%rsi), %mm0			/*     | m00 */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0		/* m11 | m00 */

	movd	40(%rsi), %mm2			/*     | m22 */
	movq	48(%rsi), %mm1			/* m31 | m30 */

	punpckldq 56(%rsi), %mm2		/* m32 | m22 */

p4_3d_no_rot_loop:

	prefetchw 32(%rdi)			/* prefetch 2 dest vertices ahead */

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */
	movd	12(%rdx), %mm7			/* 0  | x3 */

	movq	%mm5, %mm6			/* x3 | x2 */
	pfmul	%mm0, %mm4			/* x1*m11 | x0*m00 */

	punpckhdq %mm6, %mm6			/* x3 | x3 */
	pfmul	%mm2, %mm5			/* x3*m32 | x2*m22 */

	pfmul	%mm1, %mm6			/* x3*m31 | x3*m30 */
	pfacc	%mm7, %mm5			/* x3+0 | x2*m22+x3*m32 */

	pfadd	%mm6, %mm4			/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	addq	%rax, %rdx			/* advance to next src vertex */
	movq	%mm4, (%rdi)			/* write r0, r1 */
	movq	%mm5, 8(%rdi)			/* write r2, r3 */

	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	prefetch 32(%rdx)			/* leaves the flags from decl intact */
	jnz	p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms					/* leave MMX state before returning */
	ret
270
271
.align 16

/*
 * _mesa_x86_64_transform_points4_perspective
 *
 * Transform 4-component vertices using the matrix entries read from
 * byte offsets 0 (m00), 20 (m11), 32/36 (m20/m21), 40 (m22) and
 * 56 (m32).  For a source vertex (x0, x1, x2, x3):
 *     r0 = x0*m00 + x2*m20
 *     r1 = x1*m11 + x2*m21
 *     r2 = x2*m22 + x3*m32
 *     r3 = -x2
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags, also when count
 * is zero.
 *
 * Uses MMX registers with 3DNow! arithmetic and leaves MMX state with
 * femms before returning, on both exit paths.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm7 (x87/MMX state), flags.
 * Lane comments list the high dword first.  The .byte sequences are
 * prefixed NOPs used as hand-placed padding; keep the instruction order
 * and encodings unchanged or the padding has to be re-tuned.
 */
.globl _mesa_x86_64_transform_points4_perspective
_mesa_x86_64_transform_points4_perspective:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	jz	p4_perspective_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	movd	(%rsi), %mm0			/*     | m00 */
	pxor	%mm7, %mm7			/* 0   | 0   */
	punpckldq 20(%rsi), %mm0		/* m11 | m00 */

	movq	32(%rsi), %mm2			/* m21 | m20 */
	prefetch (%rdx)

	movd	40(%rsi), %mm1			/*     | m22 */

	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1		/* m32 | m22 */


p4_perspective_loop:

	prefetchw 32(%rdi)			/* prefetch 2 dest vertices ahead */

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */
	movd	8(%rdx), %mm3			/* 0  | x2 */

	movq	%mm5, %mm6			/* x3 | x2 */
	pfmul	%mm0, %mm4			/* x1*m11 | x0*m00 */

	punpckldq %mm5, %mm5			/* x2 | x2 */

	pfmul	%mm2, %mm5			/* x2*m21 | x2*m20 */
	pfsubr	%mm7, %mm3			/* 0-0 | 0-x2 */

	pfmul	%mm1, %mm6			/* x3*m32 | x2*m22 */
	pfadd	%mm4, %mm5			/* x1*m11+x2*m21 | x0*m00+x2*m20 */

	pfacc	%mm3, %mm6			/* -x2 | x2*m22+x3*m32 */

	movq	%mm5, (%rdi)			/* write r0, r1 */
	addq	%rax, %rdx			/* advance to next src vertex */
	movq	%mm6, 8(%rdi)			/* write r2, r3 */

	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	prefetch 32(%rdx)			/* leaves the flags from decl intact */
	jnz	p4_perspective_loop

p4_perspective_done:
	femms					/* leave MMX state before returning */
	ret
337
.align 16

/*
 * _mesa_x86_64_transform_points4_2d_no_rot
 *
 * Transform 4-component vertices using the matrix entries read from
 * byte offsets 0 (m00), 20 (m11) and 48/52 (m30/m31).  For a source
 * vertex (x0, x1, x2, x3):
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2
 *     r3 = x3
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags, also when count
 * is zero.
 *
 * Uses MMX registers with 3DNow! arithmetic and leaves MMX state with
 * femms before returning, on both exit paths.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6 (x87/MMX state), flags.
 * Lane comments list the high dword first.  The .byte is a NOP used as
 * hand-placed padding; keep the instruction order and encodings
 * unchanged or the padding has to be re-tuned.
 */
.globl _mesa_x86_64_transform_points4_2d_no_rot
_mesa_x86_64_transform_points4_2d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	.byte	0x90				/* manual align += 1 */
	jz	p4_2d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	movd	(%rsi), %mm0			/*     | m00 */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0		/* m11 | m00 */

	movq	48(%rsi), %mm1			/* m31 | m30 */

p4_2d_no_rot_loop:

	prefetchw 32(%rdi)			/* prefetch 2 dest vertices ahead */

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */

	pfmul	%mm0, %mm4			/* x1*m11 | x0*m00 */
	movq	%mm5, %mm6			/* x3 | x2 */

	punpckhdq %mm6, %mm6			/* x3 | x3 */

	addq	%rax, %rdx			/* advance to next src vertex */
	pfmul	%mm1, %mm6			/* x3*m31 | x3*m30 */

	prefetch 32(%rdx)
	pfadd	%mm4, %mm6			/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	movq	%mm6, (%rdi)			/* write r0, r1 */
	movq	%mm5, 8(%rdi)			/* write r2, r3 (copied unchanged) */

	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	jnz	p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms					/* leave MMX state before returning */
	ret
391
392
.align 16

/*
 * _mesa_x86_64_transform_points4_2d
 *
 * Transform 4-component vertices using the matrix entries read from
 * byte offsets 0 (m00), 4 (m01), 16 (m10), 20 (m11) and 48/52
 * (m30/m31).  For a source vertex (x0, x1, x2, x3):
 *     r0 = x0*m00 + x1*m10 + x3*m30
 *     r1 = x0*m01 + x1*m11 + x3*m31
 *     r2 = x2
 *     r3 = x3
 *
 * In:
 *   rdi = dest vector descriptor
 *   rsi = matrix
 *   rdx = source vector descriptor
 *
 * Side effects on the dest descriptor: count is copied from the source,
 * size is set to 4 and VEC_SIZE_4 is or-ed into flags, also when count
 * is zero.
 *
 * Uses MMX registers with 3DNow! arithmetic and leaves MMX state with
 * femms before returning, on both exit paths.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm6 (x87/MMX state), flags.
 * Lane comments list the high dword first.  The .byte sequences are
 * prefixed NOPs used as hand-placed padding; keep the instruction order
 * and encodings unchanged or the padding has to be re-tuned.
 */
.globl _mesa_x86_64_transform_points4_2d
_mesa_x86_64_transform_points4_2d:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	/*
	 * Explicit byte-source form: a suffix-less "movzx" with a memory
	 * operand has an ambiguous source size in AT&T syntax.
	 * NOTE(review): this reads only the low byte of the stride field,
	 * which is how the old mnemonic is believed to have been assembled;
	 * confirm the field layout against matypes.h.
	 */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (bytes between src vertices) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	jz	p4_2d_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	movd	(%rsi), %mm0			/*     | m00 */
	movd	4(%rsi), %mm1			/*     | m01 */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0		/* m10 | m00 */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1		/* m11 | m01 */

	movq	48(%rsi), %mm2			/* m31 | m30 */

p4_2d_loop:

	prefetchw 32(%rdi)			/* prefetch 2 dest vertices ahead */

	movq	(%rdx), %mm3			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */

	movq	%mm3, %mm4			/* x1 | x0 */
	movq	%mm5, %mm6			/* x3 | x2 */

	pfmul	%mm1, %mm4			/* x1*m11 | x0*m01 */
	punpckhdq %mm6, %mm6			/* x3 | x3 */

	pfmul	%mm0, %mm3			/* x1*m10 | x0*m00 */

	addq	%rax, %rdx			/* advance to next src vertex */
	pfacc	%mm4, %mm3			/* x0*m01+x1*m11 | x0*m00+x1*m10 */

	pfmul	%mm2, %mm6			/* x3*m31 | x3*m30 */
	prefetch 32(%rdx)

	pfadd	%mm6, %mm3			/* r1 | r0 */

	movq	%mm3, (%rdi)			/* write r0, r1 */
	movq	%mm5, 8(%rdi)			/* write r2, r3 (copied unchanged) */

	addq	$16, %rdi			/* dest vertices are tightly packed */

	decl	%ecx
	jnz	p4_2d_loop

p4_2d_done:
	femms					/* leave MMX state before returning */
	ret
457
458 #endif
459
460 #if defined (__ELF__) && defined (__linux__)
461 .section .note.GNU-stack,"",%progbits
462 #endif