X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fmesa%2Fx86-64%2Fxform4.S;h=e36a6276d2ae902f45ab451eb8262f27e9eb00b4;hb=f3af7886fe46706df9d21deb1ccb5de3d04a5507;hp=805969127db5f4ed2663b5122099da017f1704b2;hpb=916de35d677ca5238e9515840fa5aa9f81302c5b;p=mesa.git diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S index 805969127db..e36a6276d2a 100644 --- a/src/mesa/x86-64/xform4.S +++ b/src/mesa/x86-64/xform4.S @@ -1,6 +1,5 @@ /* * Mesa 3-D graphics library - * Version: 7.1 * * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. * @@ -17,20 +16,29 @@ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
*/ +#ifdef HAVE_CET_H +#include <cet.h> +#else +#define _CET_ENDBR +#endif #ifdef USE_X86_64_ASM -#include "matypes.h" +#define MATH_ASM_PTR_SIZE 8 +#include "math/m_vector_asm.h" .text .align 16 .globl _mesa_x86_64_cpuid +.hidden _mesa_x86_64_cpuid _mesa_x86_64_cpuid: + _CET_ENDBR pushq %rbx movl (%rdi), %eax movl 8(%rdi), %ecx @@ -46,14 +54,16 @@ _mesa_x86_64_cpuid: .align 16 .globl _mesa_x86_64_transform_points4_general +.hidden _mesa_x86_64_transform_points4_general _mesa_x86_64_transform_points4_general: /* * rdi = dest * rsi = matrix * rdx = source */ + _CET_ENDBR movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -67,7 +77,7 @@ _mesa_x86_64_transform_points4_general: movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ @@ -78,7 +88,7 @@ _mesa_x86_64_transform_points4_general: p4_general_loop: movups (%rdx), %xmm8 /* ox | oy | oz | ow */ - prefetchw 16(%rdi) + prefetcht1 16(%rdi) pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ addq %rax, %rdx @@ -91,7 +101,7 @@ p4_general_loop: addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... 
*/ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ @@ -116,17 +126,18 @@ p4_constants: .byte 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00 -.float 0f+1.0 +.float 1.0 .text .align 16 .globl _mesa_x86_64_transform_points4_3d +.hidden _mesa_x86_64_transform_points4_3d /* * this is slower than _mesa_x86_64_transform_points4_general * because it ensures that the last matrix row (or is it column?) is 0,0,0,1 */ _mesa_x86_64_transform_points4_3d: - + _CET_ENDBR leaq p4_constants(%rip), %rax prefetchnta 64(%rsi) @@ -135,7 +146,7 @@ _mesa_x86_64_transform_points4_3d: movaps 16(%rax), %xmm10 movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -147,7 +158,7 @@ _mesa_x86_64_transform_points4_3d: movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ @@ -163,7 +174,7 @@ _mesa_x86_64_transform_points4_3d: p4_3d_loop: movups (%rdx), %xmm8 /* ox | oy | oz | ow */ - prefetchw 16(%rdi) + prefetcht1 16(%rdi) pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ addq %rax, %rdx @@ -176,7 +187,7 @@ p4_3d_loop: addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... 
*/ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ @@ -192,10 +203,11 @@ p4_3d_done: .align 16 .globl _mesa_x86_64_transform_points4_identity +.hidden _mesa_x86_64_transform_points4_identity _mesa_x86_64_transform_points4_identity: - + _CET_ENDBR movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -206,8 +218,8 @@ _mesa_x86_64_transform_points4_identity: movq V4F_START(%rdx), %rsi /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch 64(%rsi) - prefetchw 64(%rdi) + prefetcht1 64(%rsi) + prefetcht1 64(%rdi) add %ecx, %ecx @@ -220,10 +232,11 @@ p4_identity_done: .align 16 .globl _mesa_3dnow_transform_points4_3d_no_rot +.hidden _mesa_3dnow_transform_points4_3d_no_rot _mesa_3dnow_transform_points4_3d_no_rot: - + _CET_ENDBR movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -237,7 +250,7 @@ _mesa_3dnow_transform_points4_3d_no_rot: movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch (%rdx) + prefetcht1 (%rdx) movd (%rsi), %mm0 /* | m00 */ .byte 0x66, 0x66, 0x90 /* manual align += 3 */ @@ -250,7 +263,7 @@ _mesa_3dnow_transform_points4_3d_no_rot: p4_3d_no_rot_loop: - prefetchw 32(%rdi) + prefetcht1 32(%rdi) movq (%rdx), %mm4 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -274,7 +287,7 @@ p4_3d_no_rot_loop: addq $16, %rdi decl %ecx - prefetch 32(%rdx) + prefetcht1 32(%rdx) jnz p4_3d_no_rot_loop p4_3d_no_rot_done: @@ -284,10 +297,11 @@ p4_3d_no_rot_done: .align 16 .globl _mesa_3dnow_transform_points4_perspective +.hidden _mesa_3dnow_transform_points4_perspective _mesa_3dnow_transform_points4_perspective: - 
+ _CET_ENDBR movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -305,7 +319,7 @@ _mesa_3dnow_transform_points4_perspective: punpckldq 20(%rsi), %mm0 /* m11 | m00 */ movq 32(%rsi), %mm2 /* m21 | m20 */ - prefetch (%rdx) + prefetcht1 (%rdx) movd 40(%rsi), %mm1 /* | m22 */ @@ -315,7 +329,7 @@ _mesa_3dnow_transform_points4_perspective: p4_perspective_loop: - prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */ movq (%rdx), %mm4 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -341,7 +355,7 @@ p4_perspective_loop: addq $16, %rdi decl %ecx - prefetch 32(%rdx) /* hopefully stride is zero */ + prefetcht1 32(%rdx) /* hopefully stride is zero */ jnz p4_perspective_loop p4_perspective_done: @@ -350,10 +364,11 @@ p4_perspective_done: .align 16 .globl _mesa_3dnow_transform_points4_2d_no_rot +.hidden _mesa_3dnow_transform_points4_2d_no_rot _mesa_3dnow_transform_points4_2d_no_rot: - + _CET_ENDBR movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -367,14 +382,14 @@ _mesa_3dnow_transform_points4_2d_no_rot: movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ movd (%rsi), %mm0 /* | m00 */ - prefetch (%rdx) + prefetcht1 (%rdx) punpckldq 20(%rsi), %mm0 /* m11 | m00 */ movq 48(%rsi), %mm1 /* m31 | m30 */ p4_2d_no_rot_loop: - prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */ movq (%rdx), %mm4 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -387,7 +402,7 @@ p4_2d_no_rot_loop: addq %rax, %rdx pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */ - prefetch 32(%rdx) /* hopefully stride is zero */ + prefetcht1 32(%rdx) /* hopefully stride is zero */ pfadd 
%mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */ movq %mm6, (%rdi) /* write r0, r1 */ @@ -405,10 +420,11 @@ p4_2d_no_rot_done: .align 16 .globl _mesa_3dnow_transform_points4_2d +.hidden _mesa_3dnow_transform_points4_2d _mesa_3dnow_transform_points4_2d: - + _CET_ENDBR movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -425,7 +441,7 @@ _mesa_3dnow_transform_points4_2d: movd (%rsi), %mm0 /* | m00 */ movd 4(%rsi), %mm1 /* | m01 */ - prefetch (%rdx) + prefetcht1 (%rdx) punpckldq 16(%rsi), %mm0 /* m10 | m00 */ .byte 0x66, 0x66, 0x90 /* manual align += 4 */ @@ -435,7 +451,7 @@ _mesa_3dnow_transform_points4_2d: p4_2d_loop: - prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */ movq (%rdx), %mm3 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -452,7 +468,7 @@ p4_2d_loop: pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */ pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */ - prefetch 32(%rdx) /* hopefully stride is zero */ + prefetcht1 32(%rdx) /* hopefully stride is zero */ pfadd %mm6, %mm3 /* r1 | r0 */