/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

#include "glheader.h"
#include "context.h"
#include "colormac.h"
#include "t_context.h"
#include "t_vertex.h"
#include "simple_list.h"
#include "enums.h"

#if defined(USE_X86_ASM)

#define X 0
#define Y 1
#define Z 2
#define W 3

#define DISASSEM 0

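/* A code-generator operand: either a bare register (mod == mod_REG)
 * or a register-relative memory reference built from a base register
 * plus displacement.
 */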
struct x86_reg {
   GLuint file:3;
   GLuint idx:3;
   GLuint mod:2;		/* mod_REG if this is just a register */
   GLint  disp:24;		/* only +/- 23 bits of offset - should be enough... */
};

struct x86_program {
   GLcontext *ctx;

   GLubyte *store;
   GLubyte *csr;

   GLuint stack_offset;

   GLboolean inputs_safe;
   GLboolean outputs_safe;
   GLboolean have_sse2;
   GLboolean need_emms;

   struct x86_reg identity;
   struct x86_reg chan0;
};


#define X86_TWOB 0x0f

/* There are more but these are all we'll use:
 */
enum x86_reg_file {
   file_REG32,
   file_MMX,
   file_XMM
};

/* Values for mod field of modr/m byte
 */
enum x86_reg_mod {
   mod_INDIRECT,
   mod_DISP8,
   mod_DISP32,
   mod_REG
};

enum x86_reg_name {
   reg_AX,
   reg_CX,
   reg_DX,
   reg_BX,
   reg_SP,
   reg_BP,
   reg_SI,
   reg_DI
};
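
/* These follow the hardware encodings 0..7, so reg.idx can be added
 * directly into opcode bytes (e.g. push is emitted as 0x50 + idx).
 */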

enum x86_cc {
   cc_O,			/* overflow */
   cc_NO,			/* not overflow */
   cc_NAE,			/* not above or equal / carry */
   cc_AE,			/* above or equal / not carry */
   cc_E,			/* equal / zero */
   cc_NE			/* not equal / not zero */
};
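
/* These line up with the Jcc condition encodings: 0x70 + cc for the
 * short form, 0x0f 0x80 + cc for the near form.
 */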

#define cc_Z  cc_E
#define cc_NZ cc_NE


/* Create and manipulate registers and regmem values:
 */
static struct x86_reg make_reg( GLuint file,
				GLuint idx )
{
   struct x86_reg reg;

   reg.file = file;
   reg.idx = idx;
   reg.mod = mod_REG;
   reg.disp = 0;

   return reg;
}

static struct x86_reg make_disp( struct x86_reg reg,
				 GLint disp )
{
   assert(reg.file == file_REG32);

   if (reg.mod == mod_REG)
      reg.disp = disp;
   else
      reg.disp += disp;

   if (reg.disp == 0)
      reg.mod = mod_INDIRECT;
   else if (reg.disp <= 127 && reg.disp >= -128)
      reg.mod = mod_DISP8;
   else
      reg.mod = mod_DISP32;

   return reg;
}

static struct x86_reg deref( struct x86_reg reg )
{
   return make_disp(reg, 0);
}

static struct x86_reg get_base_reg( struct x86_reg reg )
{
   return make_reg( reg.file, reg.idx );
}

/* Retrieve a reference to one of the function arguments, taking into
 * account any push/pop activity:
 */
static struct x86_reg make_fn_arg( struct x86_program *p,
				   GLuint arg )
{
   return make_disp(make_reg(file_REG32, reg_SP),
		    p->stack_offset + arg * 4);	/* arg[n] is at esp + n*4
						 * on entry; stack_offset
						 * tracks our own pushes */
}

static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}


/* Emit bytes to the instruction stream:
 */
static void emit_1b( struct x86_program *p, GLbyte b0 )
{
   *(GLbyte *)(p->csr++) = b0;
}

static void emit_1i( struct x86_program *p, GLint i0 )
{
   *(GLint *)(p->csr) = i0;
   p->csr += 4;
}

static void disassem( struct x86_program *p, const char *fn )
{
#if DISASSEM
   static const char *last_fn;
   if (fn && fn != last_fn) {
      _mesa_printf("%p: %s\n", (void *) p->csr, fn);
      last_fn = fn;
   }
#endif
}

static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
}

static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
   *(p->csr++) = b1;
}

static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
   *(p->csr++) = b1;
   *(p->csr++) = b2;
}

#define emit_1ub(p, b0)         emit_1ub_fn(p, b0, __FUNCTION__)
#define emit_2ub(p, b0, b1)     emit_2ub_fn(p, b0, b1, __FUNCTION__)
#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)


/* Labels, jumps and fixup:
 */
static GLubyte *get_label( struct x86_program *p )
{
   return p->csr;
}

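/* Jump displacements are relative to the end of the jump instruction
 * itself, hence the +2 (opcode + rel8) and +6 (0x0f prefix + opcode +
 * rel32) adjustments below.
 */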
static void x86_jcc( struct x86_program *p,
		     GLuint cc,
		     GLubyte *label )
{
   GLint offset = label - (get_label(p) + 2);

   if (offset <= 127 && offset >= -128) {
      emit_1ub(p, 0x70 + cc);
      emit_1b(p, (GLbyte) offset);
   }
   else {
      offset = label - (get_label(p) + 6);
      emit_2ub(p, 0x0f, 0x80 + cc);
      emit_1i(p, offset);
   }
}

/* Always use a 32bit offset for forward jumps:
 */
static GLubyte *x86_jcc_forward( struct x86_program *p,
				 GLuint cc )
{
   emit_2ub(p, 0x0f, 0x80 + cc);
   emit_1i(p, 0);
   return get_label(p);
}

/* Fixup offset from forward jump:
 */
static void do_fixup( struct x86_program *p,
		      GLubyte *fixup )
{
   *(int *)(fixup - 4) = get_label(p) - fixup;
}

static void x86_push( struct x86_program *p,
		      struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x50 + reg.idx);
   p->stack_offset += 4;
}

static void x86_pop( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x58 + reg.idx);
   p->stack_offset -= 4;
}

static void x86_inc( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x40 + reg.idx);
}

static void x86_dec( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x48 + reg.idx);
}

static void x86_ret( struct x86_program *p )
{
   emit_1ub(p, 0xc3);
}

static void mmx_emms( struct x86_program *p )
{
   assert(p->need_emms);
   emit_2ub(p, 0x0f, 0x77);
   p->need_emms = 0;
}




/* Build a modRM byte + possible displacement.  No treatment of SIB
 * indexing.  BZZT - no way to encode an absolute address.
 */
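/* ModRM byte layout: mod (bits 7..6), reg (bits 5..3), r/m (bits
 * 2..0).  When mod != mod_REG, the r/m field names the base register
 * and mod selects the size of the displacement that follows (none,
 * 8-bit or 32-bit).
 */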
static void emit_modrm( struct x86_program *p,
			struct x86_reg reg,
			struct x86_reg regmem )
{
   GLubyte val = 0;

   assert(reg.mod == mod_REG);

   val |= regmem.mod << 6;	/* mod field */
   val |= reg.idx << 3;		/* reg field */
   val |= regmem.idx;		/* r/m field */

   emit_1ub_fn(p, val, 0);

   /* Oh-oh we've stumbled into the SIB thing.
    */
   if (regmem.idx == reg_SP) {
      emit_1ub_fn(p, 0x24, 0);		/* simplistic! */
   }

   switch (regmem.mod) {
   case mod_REG:
   case mod_INDIRECT:
      break;
   case mod_DISP8:
      emit_1b(p, regmem.disp);
      break;
   case mod_DISP32:
      emit_1i(p, regmem.disp);
      break;
   default:
      _mesa_printf("unknown regmem.mod %d\n", regmem.mod);
      abort();
      break;
   }
}

/* Many x86 instructions have two opcodes to cope with the situations
 * where the destination is a register or memory reference
 * respectively.  This function selects the correct opcode based on
 * the arguments presented.
 */
static void emit_op_modrm( struct x86_program *p,
			   GLubyte op_dst_is_reg,
			   GLubyte op_dst_is_mem,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   switch (dst.mod) {
   case mod_REG:
      emit_1ub_fn(p, op_dst_is_reg, 0);
      emit_modrm(p, dst, src);
      break;
   case mod_INDIRECT:
   case mod_DISP32:
   case mod_DISP8:
      assert(src.mod == mod_REG);
      emit_1ub_fn(p, op_dst_is_mem, 0);
      emit_modrm(p, src, dst);
      break;
   default:
      _mesa_printf("unknown dst.mod %d\n", dst.mod);
      abort();
      break;
   }
}

static void x86_mov( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x8b, 0x89, dst, src );
}

static void x86_xor( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x33, 0x31, dst, src );
}

static void x86_cmp( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x3b, 0x39, dst, src );
}

static void sse2_movd( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   assert(p->have_sse2);
   emit_2ub(p, 0x66, X86_TWOB);
   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
}

static void mmx_movd( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   p->need_emms = 1;
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
}

static void mmx_movq( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   p->need_emms = 1;
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x6f, 0x7f, dst, src );
}


static void sse_movss( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, 0xF3, X86_TWOB);
   emit_op_modrm( p, 0x10, 0x11, dst, src );
}

static void sse_movaps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x28, 0x29, dst, src );
}

static void sse_movups( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x10, 0x11, dst, src );
}

static void sse_movhps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   assert(dst.mod != mod_REG || src.mod != mod_REG);
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
}

static void sse_movlps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   assert(dst.mod != mod_REG || src.mod != mod_REG);
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
}

/* SSE operations often only have one format, with dest constrained to
 * be a register:
 */
static void sse_mulps( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, X86_TWOB, 0x59);
   emit_modrm( p, dst, src );
}

static void sse_addps( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, X86_TWOB, 0x58);
   emit_modrm( p, dst, src );
}

static void sse_movhlps( struct x86_program *p,
			 struct x86_reg dst,
			 struct x86_reg src )
{
   assert(dst.mod == mod_REG && src.mod == mod_REG);
   emit_2ub(p, X86_TWOB, 0x12);
   emit_modrm( p, dst, src );
}

static void sse_movlhps( struct x86_program *p,
			 struct x86_reg dst,
			 struct x86_reg src )
{
   assert(dst.mod == mod_REG && src.mod == mod_REG);
   emit_2ub(p, X86_TWOB, 0x16);
   emit_modrm( p, dst, src );
}

static void sse2_cvtps2dq( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x5B);
   emit_modrm( p, dst, src );
}

static void sse2_packssdw( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x6B);
   emit_modrm( p, dst, src );
}

static void sse2_packsswb( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x63);
   emit_modrm( p, dst, src );
}

static void sse2_packuswb( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x67);
   emit_modrm( p, dst, src );
}

static void sse_cvtps2pi( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_XMM || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x2d);
   emit_modrm( p, dst, src );
}

static void mmx_packssdw( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_MMX || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x6b);
   emit_modrm( p, dst, src );
}

static void mmx_packuswb( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_MMX || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x67);
   emit_modrm( p, dst, src );
}


/* Load effective address:
 */
static void x86_lea( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_1ub(p, 0x8d);
   emit_modrm( p, dst, src );
}

static void x86_test( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   emit_1ub(p, 0x85);
   emit_modrm( p, dst, src );
}




/**
 * Perform a reduced swizzle:
 */
static void sse2_pshufd( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0,
			 GLubyte x,
			 GLubyte y,
			 GLubyte z,
			 GLubyte w)
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x70);
   emit_modrm(p, dest, arg0);
   emit_1ub(p, (x|(y<<2)|(z<<4)|(w<<6)));
}


/* Shufps can also be used to implement a reduced swizzle when dest ==
 * arg0.
 */
static void sse_shufps( struct x86_program *p,
			struct x86_reg dest,
			struct x86_reg arg0,
			GLubyte x,
			GLubyte y,
			GLubyte z,
			GLubyte w)
{
   emit_2ub(p, X86_TWOB, 0xC6);
   emit_modrm(p, dest, arg0);
   emit_1ub(p, (x|(y<<2)|(z<<4)|(w<<6)));
}
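
/* The shuffle immediate packs one 2-bit source selector per output
 * channel, low bits first: x | (y<<2) | (z<<4) | (w<<6).  For example
 * SHUF(X,Y,Z,W) gives 0 | (1<<2) | (2<<4) | (3<<6) = 0xE4, the
 * identity shuffle.
 */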

static void emit_load4f_4( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movups(p, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p, dest, make_disp(arg0, 8));
   sse_shufps(p, dest, get_identity(p), X,Y,Z,W );
   sse_shufps(p, dest, dest, Y,Z,X,W );
   sse_movlps(p, dest, arg0);
}

static void emit_load4f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(p, dest, get_identity(p));
   sse_movlps(p, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(p, dest, arg0);
   sse_shufps(p, dest, get_identity(p), X,Y,Z,W );
}



static void emit_load3f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(p, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(p, dest, make_disp(arg0, 8));
      sse_shufps(p, dest, dest, X,X,X,X);
      sse_movlps(p, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(p, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movss(p, dest, arg0);
}

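/* Dispatch table, indexed as load[dest_sz-1][src_sz-1].  Where the
 * source provides fewer components than the destination needs, the
 * missing ones are filled in from vtx->identity (normally 0,0,0,1).
 */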
static void (*load[4][4])( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};

static void emit_load( struct x86_program *p,
		       struct x86_reg dest,
		       GLuint sz,
		       struct x86_reg src,
		       GLuint src_sz)
{
   if (DISASSEM)
      _mesa_printf("load %d/%d\n", sz, src_sz);

   load[sz-1][src_sz-1](p, dest, src);
}

static void emit_store4f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movups(p, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(p, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(p, dest, arg0);
      sse_shufps(p, arg0, arg0, Z, Z, Z, Z ); /* NOTE! destructive */
      sse_movss(p, make_disp(dest,8), arg0);
   }
}

static void emit_store2f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movlps(p, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movss(p, dest, arg0);
}


static void (*store[4])( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

static void emit_store( struct x86_program *p,
			struct x86_reg dest,
			GLuint sz,
			struct x86_reg temp )
{
   if (DISASSEM)
      _mesa_printf("store %d\n", sz);
   store[sz-1](p, dest, temp);
}

static void emit_pack_store_4ub( struct x86_program *p,
				 struct x86_reg dest,
				 struct x86_reg temp )
{
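   /* Pipeline: scale the four floats up to 0..255, convert to
    * integers, then saturating-pack dwords -> words -> bytes and
    * store the result with a single 32-bit write.
    */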
   /* Scale by 255.0
    */
   sse_mulps(p, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(p, temp, temp);
      sse2_packssdw(p, temp, temp);
      sse2_packuswb(p, temp, temp);
      sse_movss(p, dest, temp);
   }
   else {
      struct x86_reg mmx0 = make_reg(file_MMX, 0);
      struct x86_reg mmx1 = make_reg(file_MMX, 1);
      sse_cvtps2pi(p, mmx0, temp);
      sse_movhlps(p, temp, temp);
      sse_cvtps2pi(p, mmx1, temp);
      mmx_packssdw(p, mmx0, mmx1);
      mmx_packuswb(p, mmx0, mmx0);
      mmx_movd(p, dest, mmx0);
   }
}

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(p, srcREG, ptr_to_src);
}

static void update_src_ptr( struct x86_program *p,
			    struct x86_reg srcREG,
			    struct x86_reg vtxREG,
			    struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(p, srcREG, make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(p, ptr_to_src, srcREG);
   }
}


/* Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute's input data
 *
 */
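/* Additionally: EBP holds the remaining vertex count, ESI the
 * TNLcontext pointer (clipspace state is addressed via displacements
 * from it), and XMM6/XMM7 are loaded with the identity and
 * chan_scale constants.
 */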
static GLboolean build_vertex_emit( struct x86_program *p )
{
   GLcontext *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   struct x86_reg vertexEAX = make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = make_reg(file_REG32, reg_SI);
   struct x86_reg temp = make_reg(file_XMM, 0);
   struct x86_reg vp0 = make_reg(file_XMM, 1);
   struct x86_reg vp1 = make_reg(file_XMM, 2);
   GLubyte *fixup, *label;

   p->csr = p->store;

   /* Push a few regs?
    */
/*    x86_push(p, srcECX); */
   x86_push(p, countEBP);
   x86_push(p, vtxESI);


   /* Get vertex count, compare to zero
    */
   x86_xor(p, srcECX, srcECX);
   x86_mov(p, countEBP, make_fn_arg(p, 2));
   x86_cmp(p, countEBP, srcECX);
   fixup = x86_jcc_forward(p, cc_E);

   /* Initialize destination register.
    */
   x86_mov(p, vertexEAX, make_fn_arg(p, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(p, vtxESI, make_fn_arg(p, 1));
   x86_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(p, vp0, make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(p, p->chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(p, p->identity, make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = get_label(p);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 1, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 1, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F:
	 /* Potentially the worst case - hardcode 2+1 copying:
	  */
	 if (0) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 3, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 2, temp);
	    if (a->inputsize > 2) {
	       emit_load(p, temp, 1, make_disp(srcECX, 8), 1);
	       emit_store(p, make_disp(dest,8), 1, temp);
	    }
	    else {
	       sse_movss(p, make_disp(dest,8), get_identity(p));
	    }
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 break;
      case EMIT_4F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_XYW:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, X, Y, W, Z);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;

      case EMIT_1UB_1F:
	 /* Test for PAD3 + 1UB:
	  */
	 if (j > 0 &&
	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
	 {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 1, deref(srcECX), a->inputsize);
	    sse_shufps(p, temp, temp, X, X, X, X);
	    emit_pack_store_4ub(p, make_disp(dest, -3), temp); /* overkill! */
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
	    return GL_FALSE;
	 }
	 break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
	 /* Test for 3UB + PAD1:
	  */
	 if (j == vtx->attr_count - 1 ||
	     a[1].vertoffset >= a->vertoffset + 4) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(p, temp, temp, Z, Y, X, W);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 /* Test for 3UB + 1UB:
	  */
	 else if (j < vtx->attr_count - 1 &&
		  a[1].format == EMIT_1UB_1F &&
		  a[1].vertoffset == a->vertoffset + 3) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    update_src_ptr(p, srcECX, vtxESI, a);

	    /* Make room for incoming value:
	     */
	    sse_shufps(p, temp, temp, W, X, Y, Z);

	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
	    emit_load(p, temp, 1, deref(srcECX), a[1].inputsize);
	    update_src_ptr(p, srcECX, vtxESI, &a[1]);

	    /* Rearrange and possibly do BGR conversion:
	     */
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(p, temp, temp, W, Z, Y, X);
	    else
	       sse_shufps(p, temp, temp, Y, Z, W, X);

	    emit_pack_store_4ub(p, dest, temp);
	    j++;		/* NOTE: two attrs consumed */
	 }
	 else {
	    _mesa_printf("Can't emit 3ub\n");
	 }
	 return GL_FALSE;	/* add this later */
	 break;

      case EMIT_4UB_4F_RGBA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_BGRA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, Z, Y, X, W);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ARGB:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, W, X, Y, Z);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ABGR:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, W, Z, Y, X);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4CHAN_4F_RGBA:
	 switch (CHAN_TYPE) {
	 case GL_UNSIGNED_BYTE:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_FLOAT:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 4, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_UNSIGNED_SHORT:
	 default:
	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
	    return GL_FALSE;
	 }
	 break;
      default:
	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
	 return GL_FALSE;	/* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(p, vertexEAX, make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(p, countEBP);
   x86_test(p, countEBP, countEBP);
   x86_jcc(p, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->need_emms)
      mmx_emms(p);

   /* Land forward jump here:
    */
   do_fixup(p, fixup);

   /* Pop regs and return
    */
   x86_pop(p, get_base_reg(vtxESI));
   x86_pop(p, countEBP);
/*    x86_pop(p, srcECX); */
   x86_ret(p);

   vtx->emit = (tnl_emit_func)p->store;
   return GL_TRUE;
}
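
/* The generated code is now callable through vtx->emit( ctx, count,
 * dest ) - the (ctx, count, dest) argument order matches the
 * make_fn_arg() fetches above.
 */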

#include "x86/common_x86_asm.h"


void _tnl_generate_sse_emit( GLcontext *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }

   memset(&p, 0, sizeof(p));
   p.ctx = ctx;
   p.store = MALLOC(1024);
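   /* Note: code is emitted into this fixed 1024-byte buffer with no
    * overflow checking as p.csr advances.
    */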

   p.inputs_safe = 0;		/* for now */
   p.outputs_safe = 1;		/* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = make_reg(file_XMM, 6);
   p.chan0 = make_reg(file_XMM, 7);

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
      if (DISASSEM)
	 _mesa_printf("disassemble %p %p\n", (void *) p.store, (void *) p.csr);
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      FREE(p.store);
   }

   (void)sse2_movd;
   (void)x86_inc;
   (void)x86_xor;
   (void)mmx_movq;
   (void)sse_movlhps;
   (void)sse_movhps;
   (void)sse_movaps;
   (void)sse2_packsswb;
   (void)sse2_pshufd;
}

#else

void _tnl_generate_sse_emit( GLcontext *ctx )
{
   /* Dummy version for when USE_X86_ASM not defined */
}

#endif