nir: Move compute system value lowering to a separate pass
[mesa.git] / src / mesa / x86-64 / xform4.S
1 /*
2 * Mesa 3-D graphics library
3 *
4 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
/* Control-flow enforcement support: <cet.h> supplies _CET_ENDBR. When the header is not available the macro is defined empty, so the entry markers used at the top of every routine below expand to nothing. */
24 #ifdef HAVE_CET_H
25 #include <cet.h>
26 #else
27 #define _CET_ENDBR
28 #endif
29
/* Everything up to the matching #endif near the end of the file is assembled only when USE_X86_64_ASM is defined. */
30 #ifdef USE_X86_64_ASM
31
/* MATH_ASM_PTR_SIZE is defined before the include; presumably it selects the 8-byte-pointer layout for the V4F_* offsets used below -- TODO confirm against math/m_vector_asm.h. */
32 #define MATH_ASM_PTR_SIZE 8
33 #include "math/m_vector_asm.h"
34
35 .text
36
/*
 * _mesa_x86_64_cpuid
 *
 * In:  rdi = pointer to four consecutive 32-bit words.
 *      The word at offset 0 is loaded into eax and the word at offset 8
 *      into ecx before cpuid executes.
 * Out: the same four words receive eax, ebx, ecx, edx (offsets 0, 4, 8, 12)
 *      as left by cpuid.
 * rbx is pushed and popped around the instruction because cpuid overwrites
 * ebx; eax, ecx and edx are left modified.
 */
37 .align 16
38 .globl _mesa_x86_64_cpuid
39 .hidden _mesa_x86_64_cpuid
40 _mesa_x86_64_cpuid:
41 _CET_ENDBR
42 pushq %rbx /* cpuid writes ebx; preserve the caller's rbx */
43 movl (%rdi), %eax /* eax input from word 0 */
44 movl 8(%rdi), %ecx /* ecx input from word 2 */
45
46 cpuid
47
48 movl %ebx, 4(%rdi) /* word 1 = ebx */
49 movl %eax, (%rdi) /* word 0 = eax */
50 movl %ecx, 8(%rdi) /* word 2 = ecx */
51 movl %edx, 12(%rdi) /* word 3 = edx */
52 popq %rbx
53 ret
54
/*
 * _mesa_x86_64_transform_points4_general
 *
 * For each of `count` source vertices (ox, oy, oz, ow) computes
 *     dest[k] = ox*m[k] + oy*m[4+k] + oz*m[8+k] + ow*m[12+k],  k = 0..3
 * using all sixteen matrix elements as loaded.
 *
 * The destination count is copied from the source, its size is set to 4 and
 * VEC_SIZE_4 is or'ed into its flags, even when count is zero.
 *
 * Register use inside the loop:
 *     ecx = vertices remaining, rax = source stride in bytes,
 *     rdx = current source vertex, rdi = current destination vertex,
 *     xmm4..xmm7 = the four 16-byte matrix groups, xmm8 = current vertex.
 *
 * The matrix is read and the destination is written with movaps, so both
 * must be 16-byte aligned; the source is read with movups and may be
 * unaligned. The destination advances by a fixed 16 bytes per vertex.
 */
55 .align 16
56 .globl _mesa_x86_64_transform_points4_general
57 .hidden _mesa_x86_64_transform_points4_general
58 _mesa_x86_64_transform_points4_general:
59 /*
60 * rdi = dest
61 * rsi = matrix
62 * rdx = source
63 */
64 _CET_ENDBR
65 movl V4F_COUNT(%rdx), %ecx /* count */
66 movzbl V4F_STRIDE(%rdx), %eax /* stride, read as one byte and zero-extended */
67
68 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
69 movl $4, V4F_SIZE(%rdi) /* set dest size */
70 .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 (prefixed nop, four bytes) */
71 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
72
73 testl %ecx, %ecx /* verify non-zero count */
74 prefetchnta 64(%rsi) /* prefetch leaves the flags from testl intact */
75 jz p4_general_done
76
77 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
78 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
79
80 prefetcht1 16(%rdx)
81
82 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
83 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
84 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
85 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
86 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
87
88 p4_general_loop:
89
90 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
91 prefetcht1 16(%rdi)
92
93 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
94 addq %rax, %rdx /* advance source by stride */
95 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
96 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
97 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
98 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
99 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
100 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
101 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
102 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
103 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
104 prefetcht1 16(%rdx)
105 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
106
107 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
108 addq $16, %rdi /* destination vertices are packed 16 bytes apart */
109
110 decl %ecx
111 jnz p4_general_loop
112
113 p4_general_done:
114 .byte 0xf3 /* rep prefix: together with the next line this encodes "rep ret" */
115 ret
116
/*
 * p4_constants: two 16-byte constants used by
 * _mesa_x86_64_transform_points4_3d.
 *   bytes  0..15: AND mask that keeps the three low 32-bit lanes and
 *                 clears the highest lane.
 *   bytes 16..31: zero in the three low lanes and 1.0f in the highest
 *                 lane, OR'ed in to force that lane to 1.0.
 * Aligned to 16 because both halves are loaded with movaps.
 */
117 .section .rodata
118
119 .align 16
120 p4_constants:
121 .byte 0xff, 0xff, 0xff, 0xff /* lane 0: keep */
122 .byte 0xff, 0xff, 0xff, 0xff /* lane 1: keep */
123 .byte 0xff, 0xff, 0xff, 0xff /* lane 2: keep */
124 .byte 0x00, 0x00, 0x00, 0x00 /* lane 3: clear */
125
126 .byte 0x00, 0x00, 0x00, 0x00 /* lane 0: 0 */
127 .byte 0x00, 0x00, 0x00, 0x00 /* lane 1: 0 */
128 .byte 0x00, 0x00, 0x00, 0x00 /* lane 2: 0 */
129 .float 1.0 /* lane 3: 1.0 */
130
131 .text
132 .align 16
133 .globl _mesa_x86_64_transform_points4_3d
134 .hidden _mesa_x86_64_transform_points4_3d
135 /*
136 * rdi = dest, rsi = matrix, rdx = source.
137 *
137 * Runs the same per-vertex loop as _mesa_x86_64_transform_points4_general,
137 * but first masks the loaded matrix so that elements m3, m7 and m11 are
137 * 0.0 and m15 is 1.0, whatever the caller's matrix holds there. The
137 * original author notes this makes it slower than the general routine.
137 * NOTE(review): m3/m7/m11/m15 would be the bottom row if the matrix is
137 * stored column-major -- confirm against the matrix layout.
137 *
137 * xmm9 = keep-low-three-lanes mask, xmm10 = (1.0 in the top lane), both
137 * taken from p4_constants. rax holds the constants' address only until
137 * the stride is loaded into eax.
138 */
139 _mesa_x86_64_transform_points4_3d:
140 _CET_ENDBR
141 leaq p4_constants(%rip), %rax
142
143 prefetchnta 64(%rsi)
144
145 movaps (%rax), %xmm9 /* lane mask: 0 | ~0 | ~0 | ~0 */
146 movaps 16(%rax), %xmm10 /* 1.0 | 0 | 0 | 0 */
147
148 movl V4F_COUNT(%rdx), %ecx /* count */
149 movzbl V4F_STRIDE(%rdx), %eax /* stride (replaces the constants pointer in rax) */
150
151 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
152 movl $4, V4F_SIZE(%rdi) /* set dest size */
153 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
154
155 testl %ecx, %ecx /* verify non-zero count */
156 jz p4_3d_done
157
158 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
159 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
160
161 prefetcht1 16(%rdx)
162
163 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
164 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
165 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
166 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
167 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
168 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
169 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
170 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
171 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
172 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
173
174 p4_3d_loop:
175
176 movups (%rdx), %xmm8 /* ow | oz | oy | ox */
177 prefetcht1 16(%rdi)
178
179 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
180 addq %rax, %rdx /* advance source by stride */
181 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
182 mulps %xmm4, %xmm0 /* ox*0.0 | ox*m2 | ox*m1 | ox*m0 */
183 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
184 mulps %xmm5, %xmm1 /* oy*0.0 | oy*m6 | oy*m5 | oy*m4 */
185 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
186 mulps %xmm6, %xmm2 /* oz*0.0 | oz*m10 | oz*m9 | oz*m8 */
187 addps %xmm1, %xmm0 /* sum of the ox and oy terms */
188 mulps %xmm7, %xmm3 /* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
189 addps %xmm2, %xmm0 /* plus the oz terms */
190 prefetcht1 16(%rdx)
191 addps %xmm3, %xmm0 /* plus the ow terms */
192
193 movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
194 addq $16, %rdi /* destination vertices are packed 16 bytes apart */
195
196 dec %ecx
197 jnz p4_3d_loop
198
199 p4_3d_done:
200 .byte 0xf3 /* rep prefix: together with the next line this encodes "rep ret" */
201 ret
202
203
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copies `count` four-float vertices from source to dest unchanged.
 *
 * In:  rdi = dest vector, rsi = matrix (unused), rdx = source vector
 * Out: dest count copied from source, dest size set to 4, VEC_SIZE_4 or'ed
 *      into dest flags (all done even when count is zero).
 * Clobbers: rax, rcx, rsi, rdi, xmm0, flags.
 *
 * The source is walked with its own byte stride, like the other transform
 * routines in this file; the destination is always packed at 16 bytes per
 * vertex. The earlier version loaded the stride but never used it and
 * copied count*16 contiguous bytes, which is only correct when the source
 * stride is exactly 16. That case keeps the rep movsq fast path.
 */
	.align 16
	.globl _mesa_x86_64_transform_points4_identity
	.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* count (upper half of rcx zeroed) */
	movzbl	V4F_STRIDE(%rdx), %eax		/* source stride in bytes */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx
	jz	p4_identity_done		/* nothing to copy */

	movq	V4F_START(%rdx), %rsi		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	cmpl	$16, %eax			/* tightly packed source? */
	jne	p4_identity_strided

	/* Packed source: one block copy of count * 2 quadwords. */
	addl	%ecx, %ecx
	rep movsq
	ret

p4_identity_strided:
	/* Unaligned load and store: like rep movsq, this path adds no
	 * alignment requirement on either pointer. */
	movups	(%rsi), %xmm0			/* one 16-byte vertex */
	addq	%rax, %rsi			/* advance source by stride */
	movups	%xmm0, (%rdi)
	addq	$16, %rdi			/* dest is packed */
	decl	%ecx
	jnz	p4_identity_strided

p4_identity_done:
	.byte	0xf3				/* rep prefix: with the next line encodes "rep ret" */
	ret
231
232
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * rdi = dest, rsi = matrix, rdx = source (same argument registers as the
 * SSE routines above). Uses MMX registers with 3DNow! float instructions.
 *
 * Per vertex (x0, x1, x2, x3) writes
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2*m22 + x3*m32
 *     r3 = x3
 * reading only the matrix floats at byte offsets 0, 20, 40, 48, 52 and 56.
 *
 * Loop-invariant registers: mm0 = m11 | m00, mm1 = m31 | m30,
 * mm2 = m32 | m22. ecx = vertices remaining, rax = source stride.
 * femms runs on every exit path, including the zero-count one.
 */
233 .align 16
234 .globl _mesa_3dnow_transform_points4_3d_no_rot
235 .hidden _mesa_3dnow_transform_points4_3d_no_rot
236 _mesa_3dnow_transform_points4_3d_no_rot:
237 _CET_ENDBR
238 movl V4F_COUNT(%rdx), %ecx /* count */
239 movzbl V4F_STRIDE(%rdx), %eax /* stride */
240
241 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
242 movl $4, V4F_SIZE(%rdi) /* set dest size */
243 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
244 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
245
246 test %ecx, %ecx
247 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
248 jz p4_3d_no_rot_done
249
250 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
251 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
252
253 prefetcht1 (%rdx)
254
255 movd (%rsi), %mm0 /* | m00 */
256 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
257 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
258
259 movd 40(%rsi), %mm2 /* | m22 */
260 movq 48(%rsi), %mm1 /* m31 | m30 */
261
262 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
263
264 p4_3d_no_rot_loop:
265
266 prefetcht1 32(%rdi)
267
268 movq (%rdx), %mm4 /* x1 | x0 */
269 movq 8(%rdx), %mm5 /* x3 | x2 */
270 movd 12(%rdx), %mm7 /* 0 | x3 */
271
272 movq %mm5, %mm6 /* x3 | x2 */
273 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
274
275 punpckhdq %mm6, %mm6 /* x3 | x3 */
276 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
277
278 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
279 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 (each half is the sum of one operand's two floats) */
280
281 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
282
283 addq %rax, %rdx /* advance source by stride */
284 movq %mm4, (%rdi) /* write r0, r1 */
285 movq %mm5, 8(%rdi) /* write r2, r3 */
286
287 addq $16, %rdi
288
289 decl %ecx
290 prefetcht1 32(%rdx) /* prefetch leaves the flags from decl intact */
291 jnz p4_3d_no_rot_loop
292
293 p4_3d_no_rot_done:
294 femms /* leave MMX/3DNow! state before returning */
295 ret
296
297
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * rdi = dest, rsi = matrix, rdx = source. Uses MMX registers with 3DNow!
 * float instructions.
 *
 * Per vertex (x0, x1, x2, x3) writes
 *     r0 = x0*m00 + x2*m20
 *     r1 = x1*m11 + x2*m21
 *     r2 = x2*m22 + x3*m32
 *     r3 = -x2
 * reading only the matrix floats at byte offsets 0, 20, 32, 36, 40 and 56.
 *
 * Loop-invariant registers: mm0 = m11 | m00, mm1 = m32 | m22,
 * mm2 = m21 | m20, mm7 = 0 | 0. ecx = vertices remaining,
 * rax = source stride. femms runs on every exit path.
 */
298 .align 16
299 .globl _mesa_3dnow_transform_points4_perspective
300 .hidden _mesa_3dnow_transform_points4_perspective
301 _mesa_3dnow_transform_points4_perspective:
302 _CET_ENDBR
303 movl V4F_COUNT(%rdx), %ecx /* count */
304 movzbl V4F_STRIDE(%rdx), %eax /* stride */
305
306 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
307 movl $4, V4F_SIZE(%rdi) /* set dest size */
308 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
309
310 test %ecx, %ecx
311 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
312 jz p4_perspective_done
313
314 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
315 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
316
317 movd (%rsi), %mm0 /* | m00 */
318 pxor %mm7, %mm7 /* 0 | 0 */
319 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
320
321 movq 32(%rsi), %mm2 /* m21 | m20 */
322 prefetcht1 (%rdx)
323
324 movd 40(%rsi), %mm1 /* | m22 */
325
326 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
327 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
328
329
330 p4_perspective_loop:
331
332 prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
333
334 movq (%rdx), %mm4 /* x1 | x0 */
335 movq 8(%rdx), %mm5 /* x3 | x2 */
336 movd 8(%rdx), %mm3 /* 0 | x2 */
337
338 movq %mm5, %mm6 /* x3 | x2 */
339 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
340
341 punpckldq %mm5, %mm5 /* x2 | x2 */
342
343 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
344 pfsubr %mm7, %mm3 /* 0 | -x2 (reverse subtract: mm7 - mm3) */
345
346 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
347 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
348
349 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
350
351 movq %mm5, (%rdi) /* write r0, r1 */
352 addq %rax, %rdx /* advance source by stride */
353 movq %mm6, 8(%rdi) /* write r2, r3 */
354
355 addq $16, %rdi
356
357 decl %ecx
358 prefetcht1 32(%rdx) /* hopefully stride is zero */
359 jnz p4_perspective_loop
360
361 p4_perspective_done:
362 femms /* leave MMX/3DNow! state before returning */
363 ret
364
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * rdi = dest, rsi = matrix, rdx = source. Uses MMX registers with 3DNow!
 * float instructions.
 *
 * Per vertex (x0, x1, x2, x3) writes
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2   (copied unchanged)
 *     r3 = x3   (copied unchanged)
 * reading only the matrix floats at byte offsets 0, 20, 48 and 52.
 *
 * Loop-invariant registers: mm0 = m11 | m00, mm1 = m31 | m30.
 * ecx = vertices remaining, rax = source stride. femms runs on every
 * exit path.
 */
365 .align 16
366 .globl _mesa_3dnow_transform_points4_2d_no_rot
367 .hidden _mesa_3dnow_transform_points4_2d_no_rot
368 _mesa_3dnow_transform_points4_2d_no_rot:
369 _CET_ENDBR
370 movl V4F_COUNT(%rdx), %ecx /* count */
371 movzbl V4F_STRIDE(%rdx), %eax /* stride */
372
373 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
374 movl $4, V4F_SIZE(%rdi) /* set dest size */
375 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
376
377 test %ecx, %ecx
378 .byte 0x90 /* manual align += 1 */
379 jz p4_2d_no_rot_done
380
381 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
382 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
383
384 movd (%rsi), %mm0 /* | m00 */
385 prefetcht1 (%rdx)
386 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
387
388 movq 48(%rsi), %mm1 /* m31 | m30 */
389
390 p4_2d_no_rot_loop:
391
392 prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
393
394 movq (%rdx), %mm4 /* x1 | x0 */
395 movq 8(%rdx), %mm5 /* x3 | x2 */
396
397 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
398 movq %mm5, %mm6 /* x3 | x2 */
399
400 punpckhdq %mm6, %mm6 /* x3 | x3 */
401
402 addq %rax, %rdx /* advance source by stride */
403 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
404
405 prefetcht1 32(%rdx) /* hopefully stride is zero */
406 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
407
408 movq %mm6, (%rdi) /* write r0, r1 */
409 movq %mm5, 8(%rdi) /* write r2, r3 (x2, x3 passed through) */
410
411 addq $16, %rdi
412
413 decl %ecx
414 jnz p4_2d_no_rot_loop
415
416 p4_2d_no_rot_done:
417 femms /* leave MMX/3DNow! state before returning */
418 ret
419
420
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * rdi = dest, rsi = matrix, rdx = source. Uses MMX registers with 3DNow!
 * float instructions.
 *
 * Per vertex (x0, x1, x2, x3) writes
 *     r0 = x0*m00 + x1*m10 + x3*m30
 *     r1 = x0*m01 + x1*m11 + x3*m31
 *     r2 = x2   (copied unchanged)
 *     r3 = x3   (copied unchanged)
 * reading only the matrix floats at byte offsets 0, 4, 16, 20, 48 and 52.
 *
 * Loop-invariant registers: mm0 = m10 | m00, mm1 = m11 | m01,
 * mm2 = m31 | m30. ecx = vertices remaining, rax = source stride.
 * femms runs on every exit path.
 */
421 .align 16
422 .globl _mesa_3dnow_transform_points4_2d
423 .hidden _mesa_3dnow_transform_points4_2d
424 _mesa_3dnow_transform_points4_2d:
425 _CET_ENDBR
426 movl V4F_COUNT(%rdx), %ecx /* count */
427 movzbl V4F_STRIDE(%rdx), %eax /* stride */
428
429 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
430 movl $4, V4F_SIZE(%rdi) /* set dest size */
431 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
432 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
433
434 test %ecx, %ecx
435 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
436 jz p4_2d_done
437
438 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
439 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
440
441 movd (%rsi), %mm0 /* | m00 */
442 movd 4(%rsi), %mm1 /* | m01 */
443
444 prefetcht1 (%rdx)
445
446 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
447 .byte 0x66, 0x66, 0x90 /* manual align += 3 */
448 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
449
450 movq 48(%rsi), %mm2 /* m31 | m30 */
451
452 p4_2d_loop:
453
454 prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */
455
456 movq (%rdx), %mm3 /* x1 | x0 */
457 movq 8(%rdx), %mm5 /* x3 | x2 */
458
459 movq %mm3, %mm4 /* x1 | x0 */
460 movq %mm5, %mm6 /* x3 | x2 */
461
462 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
463 punpckhdq %mm6, %mm6 /* x3 | x3 */
464
465 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
466
467 addq %rax, %rdx /* advance source by stride */
468 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
469
470 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
471 prefetcht1 32(%rdx) /* hopefully stride is zero */
472
473 pfadd %mm6, %mm3 /* r1 | r0 */
474
475 movq %mm3, (%rdi) /* write r0, r1 */
476 movq %mm5, 8(%rdi) /* write r2, r3 (x2, x3 passed through) */
477
478 addq $16, %rdi
479
480 decl %ecx
481 jnz p4_2d_loop
482
483 p4_2d_done:
484 femms /* leave MMX/3DNow! state before returning */
485 ret
486
487 #endif /* USE_X86_64_ASM */
488
/* On ELF/Linux builds, emit an empty .note.GNU-stack section; this is the conventional marker telling the linker that this object does not need an executable stack. It is emitted even when USE_X86_64_ASM is not defined. */
489 #if defined (__ELF__) && defined (__linux__)
490 .section .note.GNU-stack,"",%progbits
491 #endif