src/mesa/tnl/t_vtx_x86_gcc.S

   1 /**************************************************************************
   2
   3 Copyright 2004 Tungsten Graphics Inc., Cedar Park, Texas.
   4
   5 All Rights Reserved.
   6
   7 Permission is hereby granted, free of charge, to any person obtaining a
   8 copy of this software and associated documentation files (the "Software"),
   9 to deal in the Software without restriction, including without limitation
  10 on the rights to use, copy, modify, merge, publish, distribute, sub
  11 license, and/or sell copies of the Software, and to permit persons to whom
  12 the Software is furnished to do so, subject to the following conditions:
  13
  14 The above copyright notice and this permission notice (including the next
  15 paragraph) shall be included in all copies or substantial portions of the
  16 Software.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  21 ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  22 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  23 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  24 USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26 **************************************************************************/
  27
  28 /*
  29  * Authors:
  30  *   Keith Whitwell <keith@tungstengraphics.com>
  31  */
  32
  33 #if !defined (__DJGPP__)
     // GLOBL( x ): declare x as a global symbol and define its label at the
     // current location.  It is used below both for routine entry points
     // and for the matching *_end markers that follow each routine.
  34
  35 #define GLOBL( x )      \
  36 .globl x;               \
  37 x:
  38
  39 #else  /* defined(__DJGPP__) */
     // DJGPP variant: same as above, except that an underscore is pasted
     // in front of the symbol name (_##x).
  40
  41 #define GLOBL( x )      \
  42 .globl _##x;            \
  43 _##x:
  44
  45 #endif /* defined(__DJGPP__) */
  46
  47 .data
     // Everything that follows in this listing is assembled into .data; no
     // other section directive appears below.
     // NOTE(review): presumably deliberate, because the routines carry
     // SUBST() placeholders and look like templates rather than code that
     // is run in place -- confirm against the code that consumes them.
  48 .align 4
  49
  50 // Someone who knew a lot about this sort of thing would use this
  51 // macro to note current offsets, etc in a special region of the
  52 // object file & just make everything work out neat.  I don't know
  53 // enough to do that...
  54
     // SUBST( x ) expands to the constant (0x10101010 + x).  The routines
     // below use it in place of addresses, immediates and call
     // displacements; the trailing comments on those lines name what each
     // numbered placeholder stands for.
     // NOTE(review): the placeholders are presumably located by value and
     // overwritten by a code generator that is not part of this listing --
     // confirm how it finds them before changing the constant or the order
     // in which a routine references its placeholders.
  55 #define SUBST( x ) (0x10101010 + x)
  56
  57
  58
  59 // [dBorca] TODO
  60 // Unfold functions for each vertex size?
  61 // Build super-specialized MMX/SSE versions?
  62 // STDCALL woes (HAVE_NONSTANDARD_GLAPIENTRY):
  63 //   need separate routine for the non "fv" case,
  64 //   to clean up the stack (I guess we could codegen
  65 //   'ret nn' insn)! Also we need to call notify, then
  66 //   return, instead of jump!
  67
  68 GLOBL ( _tnl_x86_Vertex1fv )
     // Vertex routine for a 1-component vector.  Argument: 4(%esp) = v.
     //
     // Stores v[0] through the pointer held at SUBST(0), copies SUBST(1)
     // further dwords from SUBST(2) behind it, writes the advanced pointer
     // back to SUBST(0) and decrements the counter at SUBST(3).  When the
     // counter reaches zero it pushes SUBST(4) and calls the rel32 target
     // SUBST(5).
     //
     // %edi and %esi are saved and restored.  %ecx, %edx and the flags are
     // overwritten; %eax is overwritten on the call path.
     // NOTE(review): the SUBST() values are placeholders, presumably
     // patched in by code outside this listing -- keep the order and the
     // encoding of the instructions that reference them.
     //
     // Read the argument before the pushes below move %esp.
  69         movl    4(%esp), %ecx
  70         push    %edi
  71         push    %esi
  72         movl    SUBST(0), %edi  // 0x0 --> tnl->vtx.vbptr
  73         movl    (%ecx), %edx    // load v[0]
  74         movl    %edx, (%edi)    // tnl->vtx.vbptr[0] = v[0]
  75         addl    $4, %edi        // tnl->vtx.vbptr += 1
  76         movl    $SUBST(1), %ecx // 0x1 --> (tnl->vtx.vertex_size - 1)
  77         movl    $SUBST(2), %esi // 0x2 --> (tnl->vtx.vertex + 1)
     // Repeat-prefixed movsl: copy %ecx dwords from (%esi) to (%edi),
     // advancing both pointers; a count of zero copies nothing.
     // Assumes the direction flag is clear on entry -- TODO confirm.
  78         repz
  79         movsl   %ds:(%esi), %es:(%edi)
  80         movl    %edi, SUBST(0)  // 0x0 --> tnl->vtx.vbptr
  81         movl    SUBST(3), %edx  // 0x3 --> counter
  82         pop     %esi
  83         pop     %edi
  84         dec     %edx            // counter--
     // The movl below does not modify the flags, so the jne still tests
     // the result of the dec.
  85         movl    %edx, SUBST(3)  // 0x3 --> counter
  86         jne     .0              // if (counter != 0) return
  87         pushl   $SUBST(4)       // 0x4 --> ctx
     // Hand-encoded call: opcode byte 0xe8 followed by a 32-bit
     // displacement placeholder.
  88         .byte   0xe8            // call ...
  89         .long   SUBST(5)        // ... _tnl_wrap_filled_vertex(ctx)
     // Drop the argument pushed for the call.
  90         pop     %eax
  91 .0:
  92         ret                     // return
  93 GLOBL ( _tnl_x86_Vertex1fv_end )
  94
  95
  96 .align 4
  97 GLOBL ( _tnl_x86_Vertex2fv )
     // Vertex routine for a 2-component vector.  Argument: 4(%esp) = v.
     //
     // Stores v[0..1] through the pointer held at SUBST(0), copies SUBST(1)
     // further dwords from SUBST(2) behind them, writes the advanced
     // pointer back to SUBST(0) and decrements the counter at SUBST(3).
     // When the counter reaches zero it pushes SUBST(4) and calls the
     // rel32 target SUBST(5).
     //
     // %edi and %esi are saved and restored.  %eax, %ecx, %edx and the
     // flags are overwritten.
     // NOTE(review): the SUBST() values are placeholders, presumably
     // patched in by code outside this listing -- keep the order and the
     // encoding of the instructions that reference them.
     //
     // Read the argument before the pushes below move %esp.
  98         movl    4(%esp), %ecx
  99         push    %edi
 100         push    %esi
 101         movl    SUBST(0), %edi  // load tnl->vtx.vbptr
 102         movl    (%ecx), %edx    // load v[0]
 103         movl    4(%ecx), %eax   // load v[1]
 104         movl    %edx, (%edi)    // tnl->vtx.vbptr[0] = v[0]
 105         movl    %eax, 4(%edi)   // tnl->vtx.vbptr[1] = v[1]
 106         addl    $8, %edi        // tnl->vtx.vbptr += 2
 107         movl    $SUBST(1), %ecx // vertex_size - 2
 108         movl    $SUBST(2), %esi // tnl->vtx.vertex + 2
     // Repeat-prefixed movsl: copy %ecx dwords from (%esi) to (%edi),
     // advancing both pointers; a count of zero copies nothing.
     // Assumes the direction flag is clear on entry -- TODO confirm.
 109         repz
 110         movsl   %ds:(%esi), %es:(%edi)
 111         movl    %edi, SUBST(0)  // save tnl->vtx.vbptr
 112         movl    SUBST(3), %edx  // load counter
 113         pop     %esi
 114         pop     %edi
 115         dec     %edx            // counter--
     // The movl below does not modify the flags, so the jne still tests
     // the result of the dec.
 116         movl    %edx, SUBST(3)  // save counter
 117         jne     .1              // if (counter != 0) return
 118         pushl   $SUBST(4)       // load ctx
     // Hand-encoded call: opcode byte 0xe8 followed by a 32-bit
     // displacement placeholder.
 119         .byte   0xe8            // call ...
 120         .long   SUBST(5)        // ... _tnl_wrap_filled_vertex(ctx)
     // Drop the argument pushed for the call.
 121         pop     %eax
 122 .1:
 123         ret                     // return
 124 GLOBL ( _tnl_x86_Vertex2fv_end )
 125
 126 .align 4
 127 GLOBL ( _tnl_x86_Vertex3fv )
     // Vertex routine for a 3-component vector.  Argument: 4(%esp) = v.
     //
     // Stores v[0..2] through the pointer held at SUBST(0), copies SUBST(1)
     // further dwords from SUBST(2) behind them, writes the advanced
     // pointer back to SUBST(0) and decrements the counter at SUBST(3).
     // When the counter reaches zero it pushes SUBST(4) and calls the
     // rel32 target SUBST(5).
     //
     // %edi and %esi are saved and restored.  %eax, %ecx, %edx and the
     // flags are overwritten.
     // NOTE(review): the SUBST() values are placeholders, presumably
     // patched in by code outside this listing -- keep the order and the
     // encoding of the instructions that reference them.
     //
     // Read the argument before the pushes below move %esp.
 128         movl    4(%esp), %ecx
 129         push    %edi
 130         push    %esi
 131         movl    SUBST(0), %edi  // load tnl->vtx.vbptr
 132         movl    (%ecx), %edx    // load v[0]
 133         movl    4(%ecx), %eax   // load v[1]
     // %esi has already been saved, so it serves as a scratch register for
     // v[2] until it is reloaded as the copy source below.
 134         movl    8(%ecx), %esi   // load v[2]
 135         movl    %edx, (%edi)    // tnl->vtx.vbptr[0] = v[0]
 136         movl    %eax, 4(%edi)   // tnl->vtx.vbptr[1] = v[1]
 137         movl    %esi, 8(%edi)   // tnl->vtx.vbptr[2] = v[2]
 138         addl    $12, %edi       // tnl->vtx.vbptr += 3
 139         movl    $SUBST(1), %ecx // vertex_size - 3
 140         movl    $SUBST(2), %esi // tnl->vtx.vertex + 3
     // Repeat-prefixed movsl: copy %ecx dwords from (%esi) to (%edi),
     // advancing both pointers; a count of zero copies nothing.
     // Assumes the direction flag is clear on entry -- TODO confirm.
 141         repz
 142         movsl   %ds:(%esi), %es:(%edi)
 143         movl    %edi, SUBST(0)  // save tnl->vtx.vbptr
 144         movl    SUBST(3), %edx  // load counter
 145         pop     %esi
 146         pop     %edi
 147         dec     %edx            // counter--
     // The movl below does not modify the flags, so the jne still tests
     // the result of the dec.
 148         movl    %edx, SUBST(3)  // save counter
 149         jne     .2              // if (counter != 0) return
 150         pushl   $SUBST(4)       // load ctx
     // Hand-encoded call: opcode byte 0xe8 followed by a 32-bit
     // displacement placeholder.
 151         .byte   0xe8            // call ...
 152         .long   SUBST(5)        // ... _tnl_wrap_filled_vertex(ctx)
     // Drop the argument pushed for the call.
 153         pop     %eax
 154 .2:
 155         ret                     // return
 156 GLOBL ( _tnl_x86_Vertex3fv_end )
 157
 158
 159 .align 4
 160 GLOBL ( _tnl_x86_Vertex4fv )
     // Vertex routine for a 4-component vector.  Argument: 4(%esp) = v.
     //
     // Stores v[0..3] through the pointer held at SUBST(0), copies SUBST(1)
     // further dwords from SUBST(2) behind them, writes the advanced
     // pointer back to SUBST(0) and decrements the counter at SUBST(3).
     // When the counter reaches zero it pushes SUBST(4) and calls the
     // rel32 target SUBST(5).
     //
     // %edi and %esi are saved and restored.  %eax, %ecx, %edx and the
     // flags are overwritten.
     // NOTE(review): the SUBST() values are placeholders, presumably
     // patched in by code outside this listing -- keep the order and the
     // encoding of the instructions that reference them.
     //
     // Read the argument before the pushes below move %esp.
 161         movl    4(%esp), %ecx
 162         push    %edi
 163         push    %esi
 164         movl    SUBST(0), %edi  // load tnl->vtx.vbptr
 165         movl    (%ecx), %edx    // load v[0]
 166         movl    4(%ecx), %eax   // load v[1]
     // %esi has already been saved, so it serves as a scratch register for
     // v[2] until it is reloaded as the copy source below.
 167         movl    8(%ecx), %esi   // load v[2]
     // v[3] is loaded last because this overwrites %ecx, the base pointer
     // used by the three loads above.
 168         movl    12(%ecx), %ecx  // load v[3]
 169         movl    %edx, (%edi)    // tnl->vtx.vbptr[0] = v[0]
 170         movl    %eax, 4(%edi)   // tnl->vtx.vbptr[1] = v[1]
 171         movl    %esi, 8(%edi)   // tnl->vtx.vbptr[2] = v[2]
 172         movl    %ecx, 12(%edi)  // tnl->vtx.vbptr[3] = v[3]
 173         addl    $16, %edi       // tnl->vtx.vbptr += 4
 174         movl    $SUBST(1), %ecx // vertex_size - 4
 175         movl    $SUBST(2), %esi // tnl->vtx.vertex + 4
     // Repeat-prefixed movsl: copy %ecx dwords from (%esi) to (%edi),
     // advancing both pointers; a count of zero copies nothing.
     // Assumes the direction flag is clear on entry -- TODO confirm.
 176         repz
 177         movsl   %ds:(%esi), %es:(%edi)
 178         movl    %edi, SUBST(0)  // save tnl->vtx.vbptr
 179         movl    SUBST(3), %edx  // load counter
 180         pop     %esi
 181         pop     %edi
 182         dec     %edx            // counter--
     // The movl below does not modify the flags, so the jne still tests
     // the result of the dec.
 183         movl    %edx, SUBST(3)  // save counter
 184         jne     .3              // if (counter != 0) return
 185         pushl   $SUBST(4)       // load ctx
     // Hand-encoded call: opcode byte 0xe8 followed by a 32-bit
     // displacement placeholder.
 186         .byte   0xe8            // call ...
 187         .long   SUBST(5)        // ... _tnl_wrap_filled_vertex(ctx)
     // Drop the argument pushed for the call.
 188         pop     %eax
 189 .3:
 190         ret                     // return
 191 GLOBL ( _tnl_x86_Vertex4fv_end )
 192
 193
 194
 195 /**
 196  * Generic handlers for vector format data.
 197  */
 198
 199 GLOBL( _tnl_x86_Attribute1fv)
     /* Attribute routine, 1 component.  Argument: 4(%esp) = v.
      * Copies v[0] to the absolute address given by placeholder SUBST(0).
      * Overwrites %eax and %ecx; the stack is left untouched.
      * NOTE(review): SUBST(0) is presumably patched in by code outside
      * this listing -- confirm before changing the instruction encoding.
      */
 200         movl    4(%esp), %ecx
 201         movl    (%ecx), %eax    /* load v[0] */
 202         movl    %eax, SUBST(0)  /* store v[0] to current vertex */
 203         ret
 204 GLOBL ( _tnl_x86_Attribute1fv_end )
 205
 206 GLOBL( _tnl_x86_Attribute2fv)
     /* Attribute routine, 2 components.  Argument: 4(%esp) = v.
      * Copies v[0] and v[1] to the absolute addresses given by
      * placeholders SUBST(0) and SUBST(1).
      * Overwrites %eax, %ecx and %edx; the stack is left untouched.
      * NOTE(review): the SUBST() values are presumably patched in by code
      * outside this listing -- keep the order in which they are referenced.
      */
 207         movl    4(%esp), %ecx
 208         movl    (%ecx), %eax    /* load v[0] */
 209         movl    4(%ecx), %edx   /* load v[1] */
 210         movl    %eax, SUBST(0)  /* store v[0] to current vertex */
 211         movl    %edx, SUBST(1)  /* store v[1] to current vertex */
 212         ret
 213 GLOBL ( _tnl_x86_Attribute2fv_end )
 214
 215
 216 GLOBL( _tnl_x86_Attribute3fv)
     /* Attribute routine, 3 components.  Argument: 4(%esp) = v.
      * Copies v[0..2] to the absolute addresses given by placeholders
      * SUBST(0), SUBST(1) and SUBST(2).
      * Overwrites %eax, %ecx and %edx; the stack is left untouched.
      * NOTE(review): the SUBST() values are presumably patched in by code
      * outside this listing -- keep the order in which they are referenced.
      */
 217         movl    4(%esp), %ecx
 218         movl    (%ecx), %eax    /* load v[0] */
 219         movl    4(%ecx), %edx   /* load v[1] */
     /* v[2] is loaded last because it overwrites %ecx, the base pointer. */
 220         movl    8(%ecx), %ecx   /* load v[2] */
 221         movl    %eax, SUBST(0)  /* store v[0] to current vertex */
 222         movl    %edx, SUBST(1)  /* store v[1] to current vertex */
 223         movl    %ecx, SUBST(2)  /* store v[2] to current vertex */
 224         ret
 225 GLOBL ( _tnl_x86_Attribute3fv_end )
 226
 227 GLOBL( _tnl_x86_Attribute4fv)
     /* Attribute routine, 4 components.  Argument: 4(%esp) = v.
      * Copies v[0..3] to the absolute addresses given by placeholders
      * SUBST(0) .. SUBST(3).  The copy is done as two load/store pairs so
      * that only %eax and %edx are needed as data registers.
      * Overwrites %eax, %ecx and %edx; the stack is left untouched.
      * NOTE(review): the SUBST() values are presumably patched in by code
      * outside this listing -- keep the order in which they are referenced.
      */
 228         movl    4(%esp), %ecx
 229         movl    (%ecx), %eax    /* load v[0] */
 230         movl    4(%ecx), %edx   /* load v[1] */
 231         movl    %eax, SUBST(0)  /* store v[0] to current vertex */
 232         movl    %edx, SUBST(1)  /* store v[1] to current vertex */
 233         movl    8(%ecx), %eax   /* load v[2] */
 234         movl    12(%ecx), %edx  /* load v[3] */
 235         movl    %eax, SUBST(2)  /* store v[2] to current vertex */
 236         movl    %edx, SUBST(3)  /* store v[3] to current vertex */
 237         ret
 238 GLOBL ( _tnl_x86_Attribute4fv_end )
 239
 240
 241 // Choosers:
 242
 243 // Must generate all of these ahead of first usage.  Generate at
 244 // compile-time?
 245
 246
 247 GLOBL( _tnl_x86_choose_fv)
     // Chooser: calls the rel32 target SUBST(2) with two immediate
     // arguments, SUBST(0) and SUBST(1), then jumps to the address that
     // call returned in %eax.
     //
     // The 12-byte frame is released before the jump, so the jump target
     // sees %esp exactly as it was on entry to this routine, with the
     // original return address and arguments in place.  Only 8 of the 12
     // reserved bytes are written.
     // NOTE(review): the SUBST() values are placeholders, presumably
     // patched in by code outside this listing -- keep the order in which
     // they are referenced.
 248         subl    $12, %esp       // gcc does 16 byte alignment of stack frames?
 249         movl    $SUBST(0), (%esp)       // arg 0 - attrib
 250         movl    $SUBST(1), 4(%esp)      // arg 1 - N
     // Hand-encoded call: opcode byte 0xe8 followed by a 32-bit
     // displacement placeholder.
 251         .byte   0xe8                    // call ...
 252         .long   SUBST(2)                // ... do_choose
 253         add     $12, %esp               // tear down stack frame
 254         jmp     *%eax                   // jump to new func
 255 GLOBL ( _tnl_x86_choose_fv_end )
 256
 257
 258
 259 // FIRST LEVEL FUNCTIONS -- these are plugged directly into GL dispatch.
 260
 261
 262
 263 // In the 1st level dispatch functions, switch to a different
 264 // calling convention -- (const GLfloat *v) in %ecx.
 265 //
 266 // As with regular (x86) dispatch, don't create a new stack frame -
 267 // just let the 'ret' in the dispatched function return straight
 268 // back to the original caller.
 269
 270
 271
 272 // Vertex/Normal/Color, etc: the address of the function pointer
 273 // is known at codegen time.
 274
 275
 276 // Unfortunately, have to play with the stack in the non-fv case:
 277 //
 278 GLOBL( _tnl_x86_dispatch_attrf )
     // Dispatch for the non-"fv" entry points, where the values are passed
     // on the stack.  Builds a small frame whose single argument is the
     // address of the caller's first stack argument, then calls through
     // the function pointer stored at address SUBST(0).
     //
     // After the subl, the return address sits at 12(%esp), so the
     // caller's first argument is at 16(%esp).
     // Overwrites %edx, plus whatever the called function overwrites.
     // NOTE(review): SUBST(0) is a placeholder, presumably patched in by
     // code outside this listing.
 279         subl    $12, %esp       // gcc does 16 byte alignment of stack frames?
 280         leal    16(%esp), %edx  // address of first float on stack
 281         movl    %edx, (%esp)    // save as 'v'
 282         call    *SUBST(0)       // 0x0 --> tabfv[attr][n]
 283         addl    $12, %esp       // tear down frame
 284         ret                     // return
 285 GLOBL( _tnl_x86_dispatch_attrf_end )
 286
 287 // The fv case is simpler:
 288 //
 289 GLOBL( _tnl_x86_dispatch_attrfv )
     // Dispatch for the "fv" entry points: jumps through the function
     // pointer stored at address SUBST(0).  The stack is not touched, so
     // the target sees the caller's arguments unchanged and its ret goes
     // straight back to the original caller.
     // NOTE(review): SUBST(0) is a placeholder, presumably patched in by
     // code outside this listing.
 290         jmp     *SUBST(0)       // 0x0 --> tabfv[attr][n]
 291 GLOBL( _tnl_x86_dispatch_attrfv_end )
 292
 293
 294 // MultiTexcoord: the address of the function pointer must be
 295 // calculated, but can use the index argument slot to hold 'v', and
 296 // avoid setting up a new stack frame.
 297 //
 298 // [dBorca]
 299 // right, this would be the preferred approach, but gcc does not
 300 // clean up the stack after each function call when optimizing (-fdefer-pop);
 301 // can it make assumptions about what's already on the stack?  I dunno,
 302 // but in this case, we can't mess with the caller's stack frame, and
 303 // we must use a model like `_x86_dispatch_attrfv' above.  Caveat emptor!
 304
 305 // Also, will only need a maximum of four of each of these per context:
 306 //
 307 GLOBL( _tnl_x86_dispatch_multitexcoordf )
     // MultiTexCoord dispatch, values passed on the stack.
     //   4(%esp) = first argument, used as the table index
     //   8(%esp) = start of the values that follow it
     //
     // Keeps the low three bits of the first argument, replaces that
     // argument slot with the address of the values, and jumps through
     // the pointer at SUBST(0) + index * 16.  No frame is built, so the
     // target's ret goes back to the original caller.
     // Overwrites %ecx, %edx and the flags.
     // NOTE(review): this writes into the incoming argument slot, which
     // the [dBorca] remark above this routine cautions against -- confirm
     // that callers tolerate it.
     // NOTE(review): SUBST(0) is a placeholder, presumably patched in by
     // code outside this listing.
 308         movl    4(%esp), %ecx
 309         leal    8(%esp), %edx
     // Reduce the first argument to a value in 0..7.
 310         andl    $7, %ecx
     // Hand the address of the values to the target as its argument.
 311         movl    %edx, 4(%esp)
     // Scale the index by 16 bytes.
     // NOTE(review): presumably one table row of four 4-byte pointers --
     // confirm against the table layout.
 312         sall    $4, %ecx
 313         jmp     *SUBST(0)(%ecx) // 0x0 - tabfv[tex0][n]
 314 GLOBL( _tnl_x86_dispatch_multitexcoordf_end )
 315
 316 GLOBL( _tnl_x86_dispatch_multitexcoordfv )
     // MultiTexCoord dispatch, values passed by pointer.
     //   4(%esp) = first argument, used as the table index
     //   8(%esp) = second argument, passed on to the target
     //
     // Keeps the low three bits of the first argument, copies the second
     // argument into the first argument slot, and jumps through the
     // pointer at SUBST(0) + index * 16.  No frame is built, so the
     // target's ret goes back to the original caller.
     // Overwrites %ecx, %edx and the flags.
     // NOTE(review): this writes into the incoming argument slot, which
     // the [dBorca] remark above these routines cautions against --
     // confirm that callers tolerate it.
     // NOTE(review): SUBST(0) is a placeholder, presumably patched in by
     // code outside this listing.
 317         movl    4(%esp), %ecx
 318         movl    8(%esp), %edx
     // Reduce the first argument to a value in 0..7.
 319         andl    $7, %ecx
     // Hand the second argument to the target as its first argument.
 320         movl    %edx, 4(%esp)
     // Scale the index by 16 bytes.
     // NOTE(review): presumably one table row of four 4-byte pointers --
     // confirm against the table layout.
 321         sall    $4, %ecx
 322         jmp     *SUBST(0)(%ecx) // 0x0 - tabfv[tex0][n]
 323 GLOBL( _tnl_x86_dispatch_multitexcoordfv_end )
 324
 325 // VertexAttrib: the address of the function pointer must be
 326 // calculated.
 327
 328 GLOBL( _tnl_x86_dispatch_vertexattribf )
     // VertexAttrib dispatch, values passed on the stack.
     //   4(%esp) = first argument, used as the table index
     //   8(%esp) = start of the values that follow it
     //
     // Clamps the index to 16, replaces the first argument slot with the
     // address of the values, and jumps through the pointer at
     // SUBST(0) + index * 16.  No frame is built, so the target's ret goes
     // back to the original caller.
     // Overwrites %eax, %ecx and the flags.
     // NOTE(review): SUBST(0) is a placeholder, presumably patched in by
     // code outside this listing.
 329         movl    4(%esp), %eax
     // jb is an unsigned comparison: any index that is not below 16,
     // including a negative one, is replaced by 16.
     // NOTE(review): the comment below names "cmovge", but the unsigned
     // test corresponds to cmovae.
 330         cmpl    $16, %eax
 331         jb      .8              // "cmovge" is not supported on all CPUs
 332         movl    $16, %eax
 333 .8:
 334         leal    8(%esp), %ecx   // calculate 'v'
 335         movl    %ecx, 4(%esp)   // save in 1st arg slot
     // Scale the index by 16 bytes.
     // NOTE(review): presumably one table row of four 4-byte pointers --
     // confirm against the table layout.
 336         sall    $4, %eax
 337         jmp     *SUBST(0)(%eax) // 0x0 - tabfv[0][n]
 338 GLOBL( _tnl_x86_dispatch_vertexattribf_end )
 339
 340 GLOBL( _tnl_x86_dispatch_vertexattribfv )
     // VertexAttrib dispatch, values passed by pointer.
     //   4(%esp) = first argument, used as the table index
     //   8(%esp) = second argument, passed on to the target
     //
     // Clamps the index to 16, copies the second argument into the first
     // argument slot, and jumps through the pointer at
     // SUBST(0) + index * 16.  No frame is built, so the target's ret goes
     // back to the original caller.
     // Overwrites %eax, %ecx and the flags.
     // NOTE(review): SUBST(0) is a placeholder, presumably patched in by
     // code outside this listing.
 341         movl    4(%esp), %eax
     // jb is an unsigned comparison: any index that is not below 16,
     // including a negative one, is replaced by 16.
     // NOTE(review): the comment below names "cmovge", but the unsigned
     // test corresponds to cmovae.
 342         cmpl    $16, %eax
 343         jb      .9              // "cmovge" is not supported on all CPUs
 344         movl    $16, %eax
 345 .9:
 346         movl    8(%esp), %ecx   // load 'v'
 347         movl    %ecx, 4(%esp)   // save in 1st arg slot
     // Scale the index by 16 bytes.
     // NOTE(review): presumably one table row of four 4-byte pointers --
     // confirm against the table layout.
 348         sall    $4, %eax
 349         jmp     *SUBST(0)(%eax) // 0x0 - tabfv[0][n]
 350 GLOBL( _tnl_x86_dispatch_vertexattribfv_end )