X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fx86-64%2Fxform4.S;h=b0aca19c8b0963d00e22c4e42d608e97fc2e4c4e;hb=73f1e33d34b2044f2252a73e0fdd827d39724505;hp=667ecf6e589ac328de5c2ed0de3ef1f869fcecec;hpb=938d9d596324e411fde5312f2bb65b444c502c37;p=mesa.git diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S index 667ecf6e589..b0aca19c8b0 100644 --- a/src/mesa/x86-64/xform4.S +++ b/src/mesa/x86-64/xform4.S @@ -1,9 +1,7 @@ - /* * Mesa 3-D graphics library - * Version: 3.5 * - * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -18,9 +16,10 @@ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ #ifdef USE_X86_64_ASM @@ -30,8 +29,25 @@ .text .align 16 +.globl _mesa_x86_64_cpuid +.hidden _mesa_x86_64_cpuid +_mesa_x86_64_cpuid: + pushq %rbx + movl (%rdi), %eax + movl 8(%rdi), %ecx + + cpuid + + movl %ebx, 4(%rdi) + movl %eax, (%rdi) + movl %ecx, 8(%rdi) + movl %edx, 12(%rdi) + popq %rbx + ret +.align 16 .globl _mesa_x86_64_transform_points4_general +.hidden _mesa_x86_64_transform_points4_general _mesa_x86_64_transform_points4_general: /* * rdi = dest @@ -39,7 +55,7 @@ _mesa_x86_64_transform_points4_general: * rdx = source */ movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -53,7 +69,7 @@ _mesa_x86_64_transform_points4_general: movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ @@ -63,8 +79,8 @@ _mesa_x86_64_transform_points4_general: p4_general_loop: - movaps (%rdx), %xmm8 /* ox | oy | oz | ow */ - prefetchw 16(%rdi) + movups (%rdx), %xmm8 /* ox | oy | oz | ow */ + prefetcht1 16(%rdi) pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ addq %rax, %rdx @@ -77,7 +93,7 @@ p4_general_loop: addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ @@ -102,11 +118,12 @@ p4_constants: .byte 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00 -.float 0f+1.0 +.float 1.0 .text .align 16 .globl _mesa_x86_64_transform_points4_3d +.hidden _mesa_x86_64_transform_points4_3d /* * this is slower than _mesa_x86_64_transform_points4_general * because it ensures that the last matrix row (or is it column?) is 0,0,0,1 @@ -121,7 +138,7 @@ _mesa_x86_64_transform_points4_3d: movaps 16(%rax), %xmm10 movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -133,7 +150,7 @@ _mesa_x86_64_transform_points4_3d: movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ @@ -148,8 +165,8 @@ _mesa_x86_64_transform_points4_3d: p4_3d_loop: - movaps (%rdx), %xmm8 /* ox | oy | oz | ow */ - prefetchw 16(%rdi) + movups (%rdx), %xmm8 /* ox | oy | oz | ow */ + prefetcht1 16(%rdi) pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ addq %rax, %rdx @@ -162,7 +179,7 @@ p4_3d_loop: addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ - prefetch 16(%rdx) + prefetcht1 16(%rdx) addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ @@ -178,10 +195,11 @@ p4_3d_done: .align 16 .globl _mesa_x86_64_transform_points4_identity +.hidden _mesa_x86_64_transform_points4_identity _mesa_x86_64_transform_points4_identity: movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -192,8 +210,8 @@ _mesa_x86_64_transform_points4_identity: movq V4F_START(%rdx), %rsi /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch 64(%rsi) - prefetchw 64(%rdi) + prefetcht1 64(%rsi) + prefetcht1 64(%rdi) add %ecx, %ecx @@ -205,11 +223,12 @@ p4_identity_done: .align 16 -.globl _mesa_x86_64_transform_points4_3d_no_rot -_mesa_x86_64_transform_points4_3d_no_rot: +.globl _mesa_3dnow_transform_points4_3d_no_rot +.hidden _mesa_3dnow_transform_points4_3d_no_rot +_mesa_3dnow_transform_points4_3d_no_rot: movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -223,7 +242,7 @@ _mesa_x86_64_transform_points4_3d_no_rot: movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ - prefetch (%rdx) + prefetcht1 (%rdx) movd (%rsi), %mm0 /* | m00 */ .byte 0x66, 0x66, 0x90 /* manual align += 3 */ @@ -236,7 +255,7 @@ _mesa_x86_64_transform_points4_3d_no_rot: p4_3d_no_rot_loop: - prefetchw 32(%rdi) + prefetcht1 32(%rdi) movq (%rdx), %mm4 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -260,7 +279,7 @@ p4_3d_no_rot_loop: addq $16, %rdi decl %ecx - prefetch 32(%rdx) + prefetcht1 32(%rdx) jnz p4_3d_no_rot_loop p4_3d_no_rot_done: @@ -269,11 +288,12 @@ p4_3d_no_rot_done: .align 16 -.globl _mesa_x86_64_transform_points4_perspective -_mesa_x86_64_transform_points4_perspective: +.globl _mesa_3dnow_transform_points4_perspective +.hidden _mesa_3dnow_transform_points4_perspective +_mesa_3dnow_transform_points4_perspective: movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -291,7 +311,7 @@ _mesa_x86_64_transform_points4_perspective: punpckldq 20(%rsi), %mm0 /* m11 | m00 */ movq 32(%rsi), %mm2 /* m21 | m20 */ - prefetch (%rdx) + prefetcht1 (%rdx) movd 40(%rsi), %mm1 /* | m22 */ @@ -301,7 +321,7 @@ _mesa_x86_64_transform_points4_perspective: p4_perspective_loop: - prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */ movq (%rdx), %mm4 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -327,7 +347,7 @@ p4_perspective_loop: addq $16, %rdi decl %ecx - prefetch 32(%rdx) /* hopefully stride is zero */ + prefetcht1 32(%rdx) /* hopefully stride is zero */ jnz p4_perspective_loop p4_perspective_done: @@ -335,11 +355,12 @@ p4_perspective_done: ret .align 16 -.globl _mesa_x86_64_transform_points4_2d_no_rot -_mesa_x86_64_transform_points4_2d_no_rot: +.globl _mesa_3dnow_transform_points4_2d_no_rot +.hidden _mesa_3dnow_transform_points4_2d_no_rot +_mesa_3dnow_transform_points4_2d_no_rot: movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -353,14 +374,14 @@ _mesa_x86_64_transform_points4_2d_no_rot: movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ movd (%rsi), %mm0 /* | m00 */ - prefetch (%rdx) + prefetcht1 (%rdx) punpckldq 20(%rsi), %mm0 /* m11 | m00 */ movq 48(%rsi), %mm1 /* m31 | m30 */ p4_2d_no_rot_loop: - prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */ movq (%rdx), %mm4 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -373,7 +394,7 @@ p4_2d_no_rot_loop: addq %rax, %rdx pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */ - prefetch 32(%rdx) /* hopefully stride is zero */ + prefetcht1 32(%rdx) /* hopefully stride is zero */ pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */ movq %mm6, (%rdi) /* write r0, r1 */ @@ -390,11 +411,12 @@ p4_2d_no_rot_done: .align 16 -.globl _mesa_x86_64_transform_points4_2d -_mesa_x86_64_transform_points4_2d: +.globl _mesa_3dnow_transform_points4_2d +.hidden _mesa_3dnow_transform_points4_2d +_mesa_3dnow_transform_points4_2d: movl V4F_COUNT(%rdx), %ecx /* count */ - movzx V4F_STRIDE(%rdx), %eax /* stride */ + movzbl V4F_STRIDE(%rdx), %eax /* stride */ movl %ecx, V4F_COUNT(%rdi) /* set dest count */ movl $4, V4F_SIZE(%rdi) /* set dest size */ @@ -411,7 +433,7 @@ _mesa_x86_64_transform_points4_2d: movd (%rsi), %mm0 /* | m00 */ movd 4(%rsi), %mm1 /* | m01 */ - prefetch (%rdx) + prefetcht1 (%rdx) punpckldq 16(%rsi), %mm0 /* m10 | m00 */ .byte 0x66, 0x66, 0x90 /* manual align += 4 */ @@ -421,7 +443,7 @@ _mesa_x86_64_transform_points4_2d: p4_2d_loop: - prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + prefetcht1 32(%rdi) /* prefetch 2 vertices ahead */ movq (%rdx), %mm3 /* x1 | x0 */ movq 8(%rdx), %mm5 /* x3 | x2 */ @@ -438,7 +460,7 @@ p4_2d_loop: pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */ pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */ - prefetch 32(%rdx) /* hopefully stride is zero */ + prefetcht1 32(%rdx) /* hopefully stride is zero */ pfadd %mm6, %mm3 /* r1 | r0 */