/*
 * Copyright 2003 Tungsten Graphics, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

#include "glheader.h"
#include "colormac.h"
#include "simple_list.h"
#include "enums.h"

#include "vf/vf.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"


#define X 0
#define Y 1
#define Z 2
#define W 3

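/* SHUF() is the shuffle-immediate builder from rtasm/x86sse.h.  As a
 * reminder (a sketch of the usual SSE immediate encoding, which the
 * rtasm definition is assumed to match):
 *
 *    #define SHUF(x,y,z,w)  (((x)<<0) | ((y)<<2) | ((z)<<4) | ((w)<<6))
 *
 * For shufps dst, src, imm the low two result elements are selected
 * from dst and the high two from src, so SHUF(X,Y,Z,W) with dst==src
 * is the identity shuffle.
 */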

struct x86_program {
   struct x86_function func;

   struct vertex_fetch *vf;
   GLboolean inputs_safe;	/* ok to over-read the source arrays? */
   GLboolean outputs_safe;	/* ok to over-write past the output attr? */
   GLboolean have_sse2;

   struct x86_reg identity;	/* xmm reg preloaded with vf->identity (0,0,0,1) */
   struct x86_reg chan0;	/* xmm reg preloaded with vf->chan_scale (255.0) */
};


static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}

static void emit_load4f_4( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));		  /* c 0 0 0 */
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );	  /* c 0 0 1 */
   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );		  /* 0 0 c 1 */
   sse_movlps(&p->func, dest, arg0);				  /* a b c 1 */
}

static void emit_load4f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(&p->func, dest, get_identity(p));
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity:
    */
   sse_movss(&p->func, dest, arg0);
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
}



static void emit_load3f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if the input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
      sse_movlps(&p->func, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}

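/* Table of load functions, indexed [dest_size-1][src_size-1].  Where
 * the source provides fewer components than the destination needs,
 * the remainder is filled from the identity vector (0,0,0,1).
 */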
static void (*load[4][4])( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};

static void emit_load( struct x86_program *p,
		       struct x86_reg dest,
		       GLuint sz,
		       struct x86_reg src,
		       GLuint src_sz)
{
   load[sz-1][src_sz-1](p, dest, src);
}

static void emit_store4f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway; this may hurt write combining
       * and may cause other problems.
       */
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(&p->func, dest, arg0);
      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
   }
}

static void emit_store2f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}


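/* Table of store functions, indexed by the number of components to
 * emit.  Only emit_store3f will touch memory past the attribute's own
 * dwords, and only when outputs_safe is set.
 */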
static void (*store[4])( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

static void emit_store( struct x86_program *p,
			struct x86_reg dest,
			GLuint sz,
			struct x86_reg temp )
{
   store[sz-1](p, dest, temp);
}

static void emit_pack_store_4ub( struct x86_program *p,
				 struct x86_reg dest,
				 struct x86_reg temp )
{
   /* Scale by 255.0:
    */
   sse_mulps(&p->func, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(&p->func, temp, temp);	/* float -> int32 (rounded) */
      sse2_packssdw(&p->func, temp, temp);	/* int32 -> int16, saturated */
      sse2_packuswb(&p->func, temp, temp);	/* int16 -> uint8, saturated */
      sse_movss(&p->func, dest, temp);
   }
   else {
      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
      sse_cvtps2pi(&p->func, mmx0, temp);	/* low two floats -> int32 pair */
      sse_movhlps(&p->func, temp, temp);
      sse_cvtps2pi(&p->func, mmx1, temp);	/* high two floats -> int32 pair */
      mmx_packssdw(&p->func, mmx0, mmx1);
      mmx_packuswb(&p->func, mmx0, mmx0);
      mmx_movd(&p->func, dest, mmx0);
   }
}
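
/* Per component the packing above is roughly equivalent to the scalar
 * C below (a sketch, assuming vf->chan_scale holds 255.0 and CLAMP as
 * in Mesa's macros.h):
 *
 *    dest[i] = (GLubyte) CLAMP( temp[i] * 255.0f + 0.5f, 0, 255 );
 *
 * The cvt instructions do the rounding and the pack instructions
 * saturate, so neither needs explicit code.
 */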

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
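
/* Used to build displacements off the vf pointer, e.g.
 * get_offset(vf, &a->inputptr) is the byte offset of that attribute's
 * input pointer within struct vertex_fetch, suitable for
 * x86_make_disp() addressing.
 */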

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vfREG,
			 struct vf_attr *a )
{
   struct vertex_fetch *vf = p->vf;
   struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));

   /* Load current a[j].inputptr:
    */
   x86_mov(&p->func, srcREG, ptr_to_src);
}

static void update_src_ptr( struct x86_program *p,
			    struct x86_reg srcREG,
			    struct x86_reg vfREG,
			    struct vf_attr *a )
{
   if (a->inputstride) {
      struct vertex_fetch *vf = p->vf;
      struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));

      /* Add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));

      /* Save new value of a[j].inputptr:
       */
      x86_mov(&p->func, ptr_to_src, srcREG);
   }
}
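
/* Together, get_src_ptr/update_src_ptr bracket each attribute's
 * conversion code; per attribute the generated stream looks roughly
 * like (a sketch):
 *
 *    mov   srcREG, [vfREG + offset(a->inputptr)]
 *    ... load / convert / store ...
 *    lea   srcREG, [srcREG + a->inputstride]
 *    mov   [vfREG + offset(a->inputptr)], srcREG
 */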


/* Lots of hardcoding:
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute's source data
 * EBP -- loop counter (vertex count)
 * ESI -- pointer to the vertex_fetch struct
 */
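/* The code built here implements vf->emit; its arguments arrive on
 * the stack and are fetched with x86_fn_arg() below:
 *
 *    arg 1 -- struct vertex_fetch *vf
 *    arg 2 -- GLuint count
 *    arg 3 -- pointer to the output vertex buffer
 */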
static GLboolean build_vertex_emit( struct x86_program *p )
{
   struct vertex_fetch *vf = p->vf;
   GLuint j = 0;

   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
   struct x86_reg temp = x86_make_reg(file_XMM, 0);
   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
   GLubyte *fixup, *label;

   /* Save the callee-preserved regs we use:
    */
   x86_push(&p->func, countEBP);
   x86_push(&p->func, vfESI);


   /* Get vertex count, compare to zero:
    */
   x86_xor(&p->func, srcECX, srcECX);
   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
   x86_cmp(&p->func, countEBP, srcECX);
   fixup = x86_jcc_forward(&p->func, cc_E);

   /* Initialize destination register:
    */
   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));

   /* Move argument 1 (vf) into a reg:
    */
   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vf->allow_viewport_emits) {
      sse_movups(&p->func, vp0, x86_make_disp(vfESI, get_offset(vf, &vf->vp[0])));
      sse_movups(&p->func, vp1, x86_make_disp(vfESI, get_offset(vf, &vf->vp[4])));
   }

   /* Always load these, needed or not:
    */
   sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));

   /* Note address for loop jump:
    */
   label = x86_get_label(&p->func);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vf->attr_count) {
      struct vf_attr *a = &vf->attr[j];
      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 1, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_2F:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_3F:
	 /* Potentially the worst case - use hardcoded 2+1 copying
	  * rather than the generic path in the disabled if(0) branch:
	  */
	 if (0) {
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 3, temp);
	    update_src_ptr(p, srcECX, vfESI, a);
	 }
	 else {
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 2, temp);
	    if (a->inputsize > 2) {
	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
	       emit_store(p, x86_make_disp(dest,8), 1, temp);
	    }
	    else {
	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
	    }
	    update_src_ptr(p, srcECX, vfESI, a);
	 }
	 break;
      case EMIT_4F:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_2F_VIEWPORT:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_3F_VIEWPORT:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_4F_VIEWPORT:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_3F_XYW:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;

      case EMIT_1UB_1F:
	 /* Test for PAD3 + 1UB:
	  */
	 if (j > 0 &&
	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
	 {
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
	    update_src_ptr(p, srcECX, vfESI, a);
	 }
	 else {
	    /* Careful: a[-1] is out of bounds when j == 0:
	     */
	    if (j > 0)
	       _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset,
			    a[-1].vertoffset, a[-1].vertattrsize );
	    else
	       _mesa_printf("Can't emit 1ub\n");
	    return GL_FALSE;
	 }
	 break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
	 /* Test for 3UB + PAD1:
	  */
	 if (j == vf->attr_count - 1 ||
	     a[1].vertoffset >= a->vertoffset + 4) {
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vfESI, a);
	 }
	 /* Test for 3UB + 1UB:
	  */
	 else if (j < vf->attr_count - 1 &&
		  a[1].format == EMIT_1UB_1F &&
		  a[1].vertoffset == a->vertoffset + 3) {
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    update_src_ptr(p, srcECX, vfESI, a);

	    /* Make room for incoming value:
	     */
	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));

	    get_src_ptr(p, srcECX, vfESI, &a[1]);
	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
	    update_src_ptr(p, srcECX, vfESI, &a[1]);

	    /* Rearrange and possibly do BGR conversion:
	     */
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
	    else
	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));

	    emit_pack_store_4ub(p, dest, temp);
	    j++;		/* NOTE: two attrs consumed */
	 }
	 else {
	    _mesa_printf("Can't emit 3ub\n");
	 }
	 /* The whole 3UB path is disabled for now - note this return
	  * sits outside the else above, so we always fall back:
	  */
	 return GL_FALSE;	/* add this later */
	 break;

      case EMIT_4UB_4F_RGBA:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_4UB_4F_BGRA:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_4UB_4F_ARGB:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_4UB_4F_ABGR:
	 get_src_ptr(p, srcECX, vfESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vfESI, a);
	 break;
      case EMIT_4CHAN_4F_RGBA:
	 switch (CHAN_TYPE) {
	 case GL_UNSIGNED_BYTE:
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vfESI, a);
	    break;
	 case GL_FLOAT:
	    get_src_ptr(p, srcECX, vfESI, a);
	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 4, temp);
	    update_src_ptr(p, srcECX, vfESI, a);
	    break;
	 case GL_UNSIGNED_SHORT:
	 default:
	    _mesa_printf("unsupported CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
	    return GL_FALSE;
	 }
	 break;
      default:
	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
	 return GL_FALSE;	/* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above
       * also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));

   /* Decrement count, loop if not zero (the test is redundant - dec
    * already sets ZF - but harmless):
    */
   x86_dec(&p->func, countEBP);
   x86_test(&p->func, countEBP, countEBP);
   x86_jcc(&p->func, cc_NZ, label);

   /* Exit MMX state, if any MMX instructions were emitted:
    */
   if (p->func.need_emms)
      mmx_emms(&p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(&p->func, fixup);

   /* Pop regs and return:
    */
   x86_pop(&p->func, x86_get_base_reg(vfESI));
   x86_pop(&p->func, countEBP);
   x86_ret(&p->func);

   vf->emit = (vf_emit_func)x86_get_func(&p->func);
   return GL_TRUE;
}
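
/* As a whole, the emitted function is therefore roughly (a sketch):
 *
 *       push  ebp
 *       push  esi
 *       if (count == 0) goto done
 *       eax = dest, esi = vf
 *       load vp0/vp1 (if viewport emits), chan0, identity
 *    top:
 *       ... per-attribute fetch / convert / store ...
 *       lea   eax, [eax + vertex_stride]
 *       if (--count != 0) goto top
 *       emms (if mmx was used)
 *    done:
 *       pop   esi
 *       pop   ebp
 *       ret
 */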


void vf_generate_sse_emit( struct vertex_fetch *vf )
{
   struct x86_program p;

   if (!cpu_has_xmm) {
      vf->codegen_emit = NULL;
      return;
   }

   _mesa_memset(&p, 0, sizeof(p));

   p.vf = vf;
   p.inputs_safe = 0;		/* for now */
   p.outputs_safe = 0;		/* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = x86_make_reg(file_XMM, 6);
   p.chan0 = x86_make_reg(file_XMM, 7);

   x86_init_func(&p.func);

   if (build_vertex_emit(&p)) {
      vf_register_fastpath( vf, GL_TRUE );
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      vf_register_fastpath( vf, GL_FALSE );
      x86_release_func(&p.func);
   }
}
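
/* Typical use is something like the sketch below - the exact dispatch
 * is up to the caller in the rest of the vf code:
 *
 *    vf_generate_sse_emit( vf );
 *    if (vf->emit)
 *       vf->emit( vf, count, dest );
 *
 * with the argument order matching the x86_fn_arg() fetches in
 * build_vertex_emit().
 */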

#else

void vf_generate_sse_emit( struct vertex_fetch *vf )
{
   /* Dummy version for when USE_SSE_ASM is not defined:
    */
   (void) vf;
}

#endif