src/mesa/tnl/t_vertex_sse.c

   1 /*
   2  * Copyright 2003 Tungsten Graphics, inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors:
  25  *    Keith Whitwell <keithw@tungstengraphics.com>
  26  */
  27
  28 #include "glheader.h"
  29 #include "context.h"
  30 #include "colormac.h"
  31 #include "t_context.h"
  32 #include "t_vertex.h"
  33 #include "simple_list.h"
  34 #include "enums.h"
  35
  36 #if defined(USE_SSE_ASM)
  37
  38 #include "x86/rtasm/x86sse.h"
  39 #include "x86/common_x86_asm.h"
  40
  41
/* Component indices, used as the arguments of SHUF() when building
 * shufps selectors in this file.
 */
#define X    0
#define Y    1
#define Z    2
#define W    3
  46
  47
/* State carried through the generation of one vertex-emit function. */
struct x86_program {
   /* Function being assembled; every emitter in this file appends to it. */
   struct x86_function func;

   /* Context whose clipspace vertex state is being compiled. */
   GLcontext *ctx;
   /* When set, a 3-float load may read a full 16 bytes from the input. */
   GLboolean inputs_safe;
   /* When set, a 3-float store may write a full 16 bytes to the output. */
   GLboolean outputs_safe;
   /* Selects the SSE2 path instead of the MMX path when packing to ubytes. */
   GLboolean have_sse2;

   /* Register loaded from vtx->identity; supplies default components. */
   struct x86_reg identity;
   /* Register loaded from vtx->chan_scale; multiplier applied before packing. */
   struct x86_reg chan0;
};
  59
  60
  61 static struct x86_reg get_identity( struct x86_program *p )
  62 {
  63    return p->identity;
  64 }
  65
  66 static void emit_load4f_4( struct x86_program *p,
  67                            struct x86_reg dest,
  68                            struct x86_reg arg0 )
  69 {
  70    sse_movups(&p->func, dest, arg0);
  71 }
  72
  73 static void emit_load4f_3( struct x86_program *p,
  74                            struct x86_reg dest,
  75                            struct x86_reg arg0 )
  76 {
  77    /* Have to jump through some hoops:
  78     *
  79     * c 0 0 0
  80     * c 0 0 1
  81     * 0 0 c 1
  82     * a b c 1
  83     */
  84    sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
  85    sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
  86    sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
  87    sse_movlps(&p->func, dest, arg0);
  88 }
  89
  90 static void emit_load4f_2( struct x86_program *p,
  91                            struct x86_reg dest,
  92                            struct x86_reg arg0 )
  93 {
  94    /* Initialize from identity, then pull in low two words:
  95     */
  96    sse_movups(&p->func, dest, get_identity(p));
  97    sse_movlps(&p->func, dest, arg0);
  98 }
  99
 100 static void emit_load4f_1( struct x86_program *p,
 101                            struct x86_reg dest,
 102                            struct x86_reg arg0 )
 103 {
 104    /* Pull in low word, then swizzle in identity */
 105    sse_movss(&p->func, dest, arg0);
 106    sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
 107 }
 108
 109
 110
 111 static void emit_load3f_3( struct x86_program *p,
 112                            struct x86_reg dest,
 113                            struct x86_reg arg0 )
 114 {
 115    /* Over-reads by 1 dword - potential SEGV if input is a vertex
 116     * array.
 117     */
 118    if (p->inputs_safe) {
 119       sse_movups(&p->func, dest, arg0);
 120    }
 121    else {
 122       /* c 0 0 0
 123        * c c c c
 124        * a b c c
 125        */
 126       sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
 127       sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
 128       sse_movlps(&p->func, dest, arg0);
 129    }
 130 }
 131
 132 static void emit_load3f_2( struct x86_program *p,
 133                            struct x86_reg dest,
 134                            struct x86_reg arg0 )
 135 {
 136    emit_load4f_2(p, dest, arg0);
 137 }
 138
 139 static void emit_load3f_1( struct x86_program *p,
 140                            struct x86_reg dest,
 141                            struct x86_reg arg0 )
 142 {
 143    emit_load4f_1(p, dest, arg0);
 144 }
 145
 146 static void emit_load2f_2( struct x86_program *p,
 147                            struct x86_reg dest,
 148                            struct x86_reg arg0 )
 149 {
 150    sse_movlps(&p->func, dest, arg0);
 151 }
 152
 153 static void emit_load2f_1( struct x86_program *p,
 154                            struct x86_reg dest,
 155                            struct x86_reg arg0 )
 156 {
 157    emit_load4f_1(p, dest, arg0);
 158 }
 159
 160 static void emit_load1f_1( struct x86_program *p,
 161                            struct x86_reg dest,
 162                            struct x86_reg arg0 )
 163 {
 164    sse_movss(&p->func, dest, arg0);
 165 }
 166
 167 static void (*load[4][4])( struct x86_program *p,
 168                            struct x86_reg dest,
 169                            struct x86_reg arg0 ) = {
 170    { emit_load1f_1,
 171      emit_load1f_1,
 172      emit_load1f_1,
 173      emit_load1f_1 },
 174
 175    { emit_load2f_1,
 176      emit_load2f_2,
 177      emit_load2f_2,
 178      emit_load2f_2 },
 179
 180    { emit_load3f_1,
 181      emit_load3f_2,
 182      emit_load3f_3,
 183      emit_load3f_3 },
 184
 185    { emit_load4f_1,
 186      emit_load4f_2,
 187      emit_load4f_3,
 188      emit_load4f_4 }
 189 };
 190
 191 static void emit_load( struct x86_program *p,
 192                        struct x86_reg dest,
 193                        GLuint sz,
 194                        struct x86_reg src,
 195                        GLuint src_sz)
 196 {
 197    load[sz-1][src_sz-1](p, dest, src);
 198 }
 199
 200 static void emit_store4f( struct x86_program *p,
 201                           struct x86_reg dest,
 202                           struct x86_reg arg0 )
 203 {
 204    sse_movups(&p->func, dest, arg0);
 205 }
 206
 207 static void emit_store3f( struct x86_program *p,
 208                           struct x86_reg dest,
 209                           struct x86_reg arg0 )
 210 {
 211    if (p->outputs_safe) {
 212       /* Emit the extra dword anyway.  This may hurt writecombining,
 213        * may cause other problems.
 214        */
 215       sse_movups(&p->func, dest, arg0);
 216    }
 217    else {
 218       /* Alternate strategy - emit two, shuffle, emit one.
 219        */
 220       sse_movlps(&p->func, dest, arg0);
 221       sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
 222       sse_movss(&p->func, x86_make_disp(dest,8), arg0);
 223    }
 224 }
 225
 226 static void emit_store2f( struct x86_program *p,
 227                            struct x86_reg dest,
 228                            struct x86_reg arg0 )
 229 {
 230    sse_movlps(&p->func, dest, arg0);
 231 }
 232
 233 static void emit_store1f( struct x86_program *p,
 234                           struct x86_reg dest,
 235                           struct x86_reg arg0 )
 236 {
 237    sse_movss(&p->func, dest, arg0);
 238 }
 239
 240
 241 static void (*store[4])( struct x86_program *p,
 242                          struct x86_reg dest,
 243                          struct x86_reg arg0 ) =
 244 {
 245    emit_store1f,
 246    emit_store2f,
 247    emit_store3f,
 248    emit_store4f
 249 };
 250
 251 static void emit_store( struct x86_program *p,
 252                         struct x86_reg dest,
 253                         GLuint sz,
 254                         struct x86_reg temp )
 255
 256 {
 257    store[sz-1](p, dest, temp);
 258 }
 259
 260 static void emit_pack_store_4ub( struct x86_program *p,
 261                                  struct x86_reg dest,
 262                                  struct x86_reg temp )
 263 {
 264    /* Scale by 255.0
 265     */
 266    sse_mulps(&p->func, temp, p->chan0);
 267
 268    if (p->have_sse2) {
 269       sse2_cvtps2dq(&p->func, temp, temp);
 270       sse2_packssdw(&p->func, temp, temp);
 271       sse2_packuswb(&p->func, temp, temp);
 272       sse_movss(&p->func, dest, temp);
 273    }
 274    else {
 275       struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
 276       struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
 277       sse_cvtps2pi(&p->func, mmx0, temp);
 278       sse_movhlps(&p->func, temp, temp);
 279       sse_cvtps2pi(&p->func, mmx1, temp);
 280       mmx_packssdw(&p->func, mmx0, mmx1);
 281       mmx_packuswb(&p->func, mmx0, mmx0);
 282       mmx_movd(&p->func, dest, mmx0);
 283    }
 284 }
 285
 286 static GLint get_offset( const void *a, const void *b )
 287 {
 288    return (const char *)b - (const char *)a;
 289 }
 290
 291 /* Not much happens here.  Eventually use this function to try and
 292  * avoid saving/reloading the source pointers each vertex (if some of
 293  * them can fit in registers).
 294  */
 295 static void get_src_ptr( struct x86_program *p,
 296                          struct x86_reg srcREG,
 297                          struct x86_reg vtxREG,
 298                          struct tnl_clipspace_attr *a )
 299 {
 300    struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
 301    struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
 302
 303    /* Load current a[j].inputptr
 304     */
 305    x86_mov(&p->func, srcREG, ptr_to_src);
 306 }
 307
 308 static void update_src_ptr( struct x86_program *p,
 309                          struct x86_reg srcREG,
 310                          struct x86_reg vtxREG,
 311                          struct tnl_clipspace_attr *a )
 312 {
 313    if (a->inputstride) {
 314       struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
 315       struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
 316
 317       /* add a[j].inputstride (hardcoded value - could just as easily
 318        * pull the stride value from memory each time).
 319        */
 320       x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
 321
 322       /* save new value of a[j].inputptr
 323        */
 324       x86_mov(&p->func, ptr_to_src, srcREG);
 325    }
 326 }
 327
 328
 329 /* Lots of hardcoding
 330  *
 331  * EAX -- pointer to current output vertex
 332  * ECX -- pointer to current attribute
 333  *
 334  */
 335 static GLboolean build_vertex_emit( struct x86_program *p )
 336 {
 337    GLcontext *ctx = p->ctx;
 338    TNLcontext *tnl = TNL_CONTEXT(ctx);
 339    struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
 340    GLuint j = 0;
 341
 342    struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
 343    struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
 344    struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
 345    struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
 346    struct x86_reg temp = x86_make_reg(file_XMM, 0);
 347    struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
 348    struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
 349    GLubyte *fixup, *label;
 350
 351    x86_init_func(&p->func);
 352
 353    /* Push a few regs?
 354     */
 355    x86_push(&p->func, countEBP);
 356    x86_push(&p->func, vtxESI);
 357
 358
 359    /* Get vertex count, compare to zero
 360     */
 361    x86_xor(&p->func, srcECX, srcECX);
 362    x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
 363    x86_cmp(&p->func, countEBP, srcECX);
 364    fixup = x86_jcc_forward(&p->func, cc_E);
 365
 366    /* Initialize destination register.
 367     */
 368    x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
 369
 370    /* Dereference ctx to get tnl, then vtx:
 371     */
 372    x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
 373    x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
 374    vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
 375
 376
 377    /* Possibly load vp0, vp1 for viewport calcs:
 378     */
 379    if (vtx->need_viewport) {
 380       sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
 381       sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
 382    }
 383
 384    /* always load, needed or not:
 385     */
 386    sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
 387    sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));
 388
 389    /* Note address for loop jump */
 390    label = x86_get_label(&p->func);
 391
 392    /* Emit code for each of the attributes.  Currently routes
 393     * everything through SSE registers, even when it might be more
 394     * efficient to stick with regular old x86.  No optimization or
 395     * other tricks - enough new ground to cover here just getting
 396     * things working.
 397     */
 398    while (j < vtx->attr_count) {
 399       struct tnl_clipspace_attr *a = &vtx->attr[j];
 400       struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
 401
 402       /* Now, load an XMM reg from src, perhaps transform, then save.
 403        * Could be shortcircuited in specific cases:
 404        */
 405       switch (a->format) {
 406       case EMIT_1F:
 407          get_src_ptr(p, srcECX, vtxESI, a);
 408          emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
 409          emit_store(p, dest, 1, temp);
 410          update_src_ptr(p, srcECX, vtxESI, a);
 411          break;
 412       case EMIT_2F:
 413          get_src_ptr(p, srcECX, vtxESI, a);
 414          emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 415          emit_store(p, dest, 2, temp);
 416          update_src_ptr(p, srcECX, vtxESI, a);
 417          break;
 418       case EMIT_3F:
 419          /* Potentially the worst case - hardcode 2+1 copying:
 420           */
 421          if (0) {
 422             get_src_ptr(p, srcECX, vtxESI, a);
 423             emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
 424             emit_store(p, dest, 3, temp);
 425             update_src_ptr(p, srcECX, vtxESI, a);
 426          }
 427          else {
 428             get_src_ptr(p, srcECX, vtxESI, a);
 429             emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 430             emit_store(p, dest, 2, temp);
 431             if (a->inputsize > 2) {
 432                emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
 433                emit_store(p, x86_make_disp(dest,8), 1, temp);
 434             }
 435             else {
 436                sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
 437             }
 438             update_src_ptr(p, srcECX, vtxESI, a);
 439          }
 440          break;
 441       case EMIT_4F:
 442          get_src_ptr(p, srcECX, vtxESI, a);
 443          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 444          emit_store(p, dest, 4, temp);
 445          update_src_ptr(p, srcECX, vtxESI, a);
 446          break;
 447       case EMIT_2F_VIEWPORT:
 448          get_src_ptr(p, srcECX, vtxESI, a);
 449          emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 450          sse_mulps(&p->func, temp, vp0);
 451          sse_addps(&p->func, temp, vp1);
 452          emit_store(p, dest, 2, temp);
 453          update_src_ptr(p, srcECX, vtxESI, a);
 454          break;
 455       case EMIT_3F_VIEWPORT:
 456          get_src_ptr(p, srcECX, vtxESI, a);
 457          emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
 458          sse_mulps(&p->func, temp, vp0);
 459          sse_addps(&p->func, temp, vp1);
 460          emit_store(p, dest, 3, temp);
 461          update_src_ptr(p, srcECX, vtxESI, a);
 462          break;
 463       case EMIT_4F_VIEWPORT:
 464          get_src_ptr(p, srcECX, vtxESI, a);
 465          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 466          sse_mulps(&p->func, temp, vp0);
 467          sse_addps(&p->func, temp, vp1);
 468          emit_store(p, dest, 4, temp);
 469          update_src_ptr(p, srcECX, vtxESI, a);
 470          break;
 471       case EMIT_3F_XYW:
 472          get_src_ptr(p, srcECX, vtxESI, a);
 473          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 474          sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
 475          emit_store(p, dest, 3, temp);
 476          update_src_ptr(p, srcECX, vtxESI, a);
 477          break;
 478
 479       case EMIT_1UB_1F:
 480          /* Test for PAD3 + 1UB:
 481           */
 482          if (j > 0 &&
 483              a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
 484          {
 485             get_src_ptr(p, srcECX, vtxESI, a);
 486             emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
 487             sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
 488             emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
 489             update_src_ptr(p, srcECX, vtxESI, a);
 490          }
 491          else {
 492             _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
 493             return GL_FALSE;
 494          }
 495          break;
 496       case EMIT_3UB_3F_RGB:
 497       case EMIT_3UB_3F_BGR:
 498          /* Test for 3UB + PAD1:
 499           */
 500          if (j == vtx->attr_count - 1 ||
 501              a[1].vertoffset >= a->vertoffset + 4) {
 502             get_src_ptr(p, srcECX, vtxESI, a);
 503             emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
 504             if (a->format == EMIT_3UB_3F_BGR)
 505                sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
 506             emit_pack_store_4ub(p, dest, temp);
 507             update_src_ptr(p, srcECX, vtxESI, a);
 508          }
 509          /* Test for 3UB + 1UB:
 510           */
 511          else if (j < vtx->attr_count - 1 &&
 512                   a[1].format == EMIT_1UB_1F &&
 513                   a[1].vertoffset == a->vertoffset + 3) {
 514             get_src_ptr(p, srcECX, vtxESI, a);
 515             emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
 516             update_src_ptr(p, srcECX, vtxESI, a);
 517
 518             /* Make room for incoming value:
 519              */
 520             sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
 521
 522             get_src_ptr(p, srcECX, vtxESI, &a[1]);
 523             emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
 524             update_src_ptr(p, srcECX, vtxESI, &a[1]);
 525
 526             /* Rearrange and possibly do BGR conversion:
 527              */
 528             if (a->format == EMIT_3UB_3F_BGR)
 529                sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
 530             else
 531                sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
 532
 533             emit_pack_store_4ub(p, dest, temp);
 534             j++;                /* NOTE: two attrs consumed */
 535          }
 536          else {
 537             _mesa_printf("Can't emit 3ub\n");
 538          }
 539          return GL_FALSE;       /* add this later */
 540          break;
 541
 542       case EMIT_4UB_4F_RGBA:
 543          get_src_ptr(p, srcECX, vtxESI, a);
 544          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 545          emit_pack_store_4ub(p, dest, temp);
 546          update_src_ptr(p, srcECX, vtxESI, a);
 547          break;
 548       case EMIT_4UB_4F_BGRA:
 549          get_src_ptr(p, srcECX, vtxESI, a);
 550          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 551          sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
 552          emit_pack_store_4ub(p, dest, temp);
 553          update_src_ptr(p, srcECX, vtxESI, a);
 554          break;
 555       case EMIT_4UB_4F_ARGB:
 556          get_src_ptr(p, srcECX, vtxESI, a);
 557          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 558          sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
 559          emit_pack_store_4ub(p, dest, temp);
 560          update_src_ptr(p, srcECX, vtxESI, a);
 561          break;
 562       case EMIT_4UB_4F_ABGR:
 563          get_src_ptr(p, srcECX, vtxESI, a);
 564          emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 565          sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
 566          emit_pack_store_4ub(p, dest, temp);
 567          update_src_ptr(p, srcECX, vtxESI, a);
 568          break;
 569       case EMIT_4CHAN_4F_RGBA:
 570          switch (CHAN_TYPE) {
 571          case GL_UNSIGNED_BYTE:
 572             get_src_ptr(p, srcECX, vtxESI, a);
 573             emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 574             emit_pack_store_4ub(p, dest, temp);
 575             update_src_ptr(p, srcECX, vtxESI, a);
 576             break;
 577          case GL_FLOAT:
 578             get_src_ptr(p, srcECX, vtxESI, a);
 579             emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 580             emit_store(p, dest, 4, temp);
 581             update_src_ptr(p, srcECX, vtxESI, a);
 582             break;
 583          case GL_UNSIGNED_SHORT:
 584          default:
 585             _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
 586             return GL_FALSE;
 587          }
 588          break;
 589       default:
 590          _mesa_printf("unknown a[%d].format %d\n", j, a->format);
 591          return GL_FALSE;       /* catch any new opcodes */
 592       }
 593
 594       /* Increment j by at least 1 - may have been incremented above also:
 595        */
 596       j++;
 597    }
 598
 599    /* Next vertex:
 600     */
 601    x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));
 602
 603    /* decr count, loop if not zero
 604     */
 605    x86_dec(&p->func, countEBP);
 606    x86_test(&p->func, countEBP, countEBP);
 607    x86_jcc(&p->func, cc_NZ, label);
 608
 609    /* Exit mmx state?
 610     */
 611    if (p->func.need_emms)
 612       mmx_emms(&p->func);
 613
 614    /* Land forward jump here:
 615     */
 616    x86_fixup_fwd_jump(&p->func, fixup);
 617
 618    /* Pop regs and return
 619     */
 620    x86_pop(&p->func, x86_get_base_reg(vtxESI));
 621    x86_pop(&p->func, countEBP);
 622    x86_ret(&p->func);
 623
 624    vtx->emit = (tnl_emit_func)x86_get_func(&p->func);
 625    return GL_TRUE;
 626 }
 627
 628
 629
 630 void _tnl_generate_sse_emit( GLcontext *ctx )
 631 {
 632    struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
 633    struct x86_program p;
 634
 635    if (!cpu_has_xmm) {
 636       vtx->codegen_emit = NULL;
 637       return;
 638    }
 639
 640    _mesa_memset(&p, 0, sizeof(p));
 641
 642    p.ctx = ctx;
 643    p.inputs_safe = 0;           /* for now */
 644    p.outputs_safe = 1;          /* for now */
 645    p.have_sse2 = cpu_has_xmm2;
 646    p.identity = x86_make_reg(file_XMM, 6);
 647    p.chan0 = x86_make_reg(file_XMM, 7);
 648
 649    x86_init_func(&p.func);
 650
 651    if (build_vertex_emit(&p)) {
 652       _tnl_register_fastpath( vtx, GL_TRUE );
 653    }
 654    else {
 655       /* Note the failure so that we don't keep trying to codegen an
 656        * impossible state:
 657        */
 658       _tnl_register_fastpath( vtx, GL_FALSE );
 659       x86_release_func(&p.func);
 660    }
 661 }
 662
 663 #else
 664
 665 void _tnl_generate_sse_emit( GLcontext *ctx )
 666 {
 667    /* Dummy version for when USE_SSE_ASM not defined */
 668 }
 669
 670 #endif