src/gallium/auxiliary/translate/translate_sse.c

   1 /*
   2  * Copyright 2003 Tungsten Graphics, inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors:
  25  *    Keith Whitwell <keithw@tungstengraphics.com>
  26  */
  27
  28
  29 #include "pipe/p_config.h"
  30 #include "pipe/p_compiler.h"
  31 #include "util/u_memory.h"
  32 #include "util/u_math.h"
  33 #include "util/u_format.h"
  34
  35 #include "translate.h"
  36
  37
  38 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
  39
  40 #include "rtasm/rtasm_cpu.h"
  41 #include "rtasm/rtasm_x86sse.h"
  42
  43
  44 #define X    0
  45 #define Y    1
  46 #define Z    2
  47 #define W    3
  48
  49
  50 struct translate_buffer {
  51    const void *base_ptr;
  52    uintptr_t stride;
  53    unsigned max_index;
  54 };
  55
  56 struct translate_buffer_varient {
  57    unsigned buffer_index;
  58    unsigned instance_divisor;
  59    void *ptr;                    /* updated either per vertex or per instance */
  60 };
  61
  62
  63 #define ELEMENT_BUFFER_INSTANCE_ID  1001
  64
  65 #define NUM_CONSTS 7
  66
  67 enum
  68 {
  69    CONST_IDENTITY,
  70    CONST_INV_127,
  71    CONST_INV_255,
  72    CONST_INV_32767,
  73    CONST_INV_65535,
  74    CONST_INV_2147483647,
  75    CONST_255
  76 };
  77
  78 #define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
  79 static float consts[NUM_CONSTS][4] = {
  80       {0, 0, 0, 1},
  81       C(1.0 / 127.0),
  82       C(1.0 / 255.0),
  83       C(1.0 / 32767.0),
  84       C(1.0 / 65535.0),
  85       C(1.0 / 2147483647.0),
  86       C(255.0)
  87 };
  88 #undef C
  89
  90 struct translate_sse {
  91    struct translate translate;
  92
  93    struct x86_function linear_func;
  94    struct x86_function elt_func;
  95    struct x86_function elt16_func;
  96    struct x86_function elt8_func;
  97    struct x86_function *func;
  98
  99    PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
 100    int8_t reg_to_const[16];
 101    int8_t const_to_reg[NUM_CONSTS];
 102
 103    struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
 104    unsigned nr_buffers;
 105
 106    /* Multiple buffer varients can map to a single buffer. */
 107    struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
 108    unsigned nr_buffer_varients;
 109
 110    /* Multiple elements can map to a single buffer varient. */
 111    unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
 112
 113    boolean use_instancing;
 114    unsigned instance_id;
 115
 116    /* these are actually known values, but putting them in a struct
 117     * like this is helpful to keep them in sync across the file.
 118     */
 119    struct x86_reg tmp_EAX;
 120    struct x86_reg tmp2_EDX;
 121    struct x86_reg src_ECX;
 122    struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
 123    struct x86_reg machine_EDI;
 124    struct x86_reg outbuf_EBX;
 125    struct x86_reg count_EBP;    /* decrements to zero */
 126 };
 127
 128 static int get_offset( const void *a, const void *b )
 129 {
 130    return (const char *)b - (const char *)a;
 131 }
 132
 133 static struct x86_reg get_const( struct translate_sse *p, unsigned id)
 134 {
 135    struct x86_reg reg;
 136    unsigned i;
 137
 138    if(p->const_to_reg[id] >= 0)
 139       return x86_make_reg(file_XMM, p->const_to_reg[id]);
 140
 141    for(i = 2; i < 8; ++i)
 142    {
 143       if(p->reg_to_const[i] < 0)
 144          break;
 145    }
 146
 147    /* TODO: be smarter here */
 148    if(i == 8)
 149       --i;
 150
 151    reg = x86_make_reg(file_XMM, i);
 152
 153    if(p->reg_to_const[i] >= 0)
 154       p->const_to_reg[p->reg_to_const[i]] = -1;
 155
 156    p->reg_to_const[i] = id;
 157    p->const_to_reg[id] = i;
 158
 159    /* TODO: this should happen outside the loop, if possible */
 160    sse_movaps(p->func, reg,
 161          x86_make_disp(p->machine_EDI,
 162                get_offset(p, &p->consts[id][0])));
 163
 164    return reg;
 165 }
 166
 167 /* load the data in a SSE2 register, padding with zeros */
 168 static boolean emit_load_sse2( struct translate_sse *p,
 169                                        struct x86_reg data,
 170                                        struct x86_reg src,
 171                                        unsigned size)
 172 {
 173    struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
 174    struct x86_reg tmp = p->tmp_EAX;
 175    switch(size)
 176    {
 177    case 1:
 178       x86_movzx8(p->func, tmp, src);
 179       sse2_movd(p->func, data, tmp);
 180       break;
 181    case 2:
 182       x86_movzx16(p->func, tmp, src);
 183       sse2_movd(p->func, data, tmp);
 184       break;
 185    case 3:
 186       x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
 187       x86_shl_imm(p->func, tmp, 16);
 188       x86_mov16(p->func, tmp, src);
 189       sse2_movd(p->func, data, tmp);
 190       break;
 191    case 4:
 192       sse2_movd(p->func, data, src);
 193       break;
 194    case 6:
 195       sse2_movd(p->func, data, src);
 196       x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
 197       sse2_movd(p->func, tmpXMM, tmp);
 198       sse2_punpckldq(p->func, data, tmpXMM);
 199       break;
 200    case 8:
 201       sse2_movq(p->func, data, src);
 202       break;
 203    case 12:
 204       sse2_movq(p->func, data, src);
 205       sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
 206       sse2_punpcklqdq(p->func, data, tmpXMM);
 207       break;
 208    case 16:
 209       sse2_movdqu(p->func, data, src);
 210       break;
 211    default:
 212       return FALSE;
 213    }
 214    return TRUE;
 215 }
 216
 217 /* this value can be passed for the out_chans argument */
 218 #define CHANNELS_0001 5
 219
 220 /* this function will load #chans float values, and will
 221  * pad the register with zeroes at least up to out_chans.
 222  *
 223  * If out_chans is set to CHANNELS_0001, then the fourth
 224  * value will be padded with 1. Only pass this value if
 225  * chans < 4 or results are undefined.
 226  */
 227 static void emit_load_float32( struct translate_sse *p,
 228                                        struct x86_reg data,
 229                                        struct x86_reg arg0,
 230                                        unsigned out_chans,
 231                                        unsigned chans)
 232 {
 233    switch(chans)
 234    {
 235    case 1:
 236       /* a 0 0 0
 237        * a 0 0 1
 238        */
 239       sse_movss(p->func, data, arg0);
 240       if(out_chans == CHANNELS_0001)
 241          sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
 242       break;
 243    case 2:
 244       /* 0 0 0 1
 245        * a b 0 1
 246        */
 247       if(out_chans == CHANNELS_0001)
 248          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
 249       else if(out_chans > 2)
 250          sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
 251       sse_movlps(p->func, data, arg0);
 252       break;
 253    case 3:
 254       /* Have to jump through some hoops:
 255        *
 256        * c 0 0 0
 257        * c 0 0 1 if out_chans == CHANNELS_0001
 258        * 0 0 c 0/1
 259        * a b c 0/1
 260        */
 261       sse_movss(p->func, data, x86_make_disp(arg0, 8));
 262       if(out_chans == CHANNELS_0001)
 263          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
 264       sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
 265       sse_movlps(p->func, data, arg0);
 266       break;
 267    case 4:
 268       sse_movups(p->func, data, arg0);
 269       break;
 270    }
 271 }
 272
 273 /* this function behaves like emit_load_float32, but loads
 274    64-bit floating point numbers, converting them to 32-bit
 275   ones */
 276 static void emit_load_float64to32( struct translate_sse *p,
 277                                        struct x86_reg data,
 278                                        struct x86_reg arg0,
 279                                        unsigned out_chans,
 280                                        unsigned chans)
 281 {
 282    struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
 283    switch(chans)
 284    {
 285    case 1:
 286       sse2_movsd(p->func, data, arg0);
 287       if(out_chans > 1)
 288          sse2_cvtpd2ps(p->func, data, data);
 289       else
 290          sse2_cvtsd2ss(p->func, data, data);
 291       if(out_chans == CHANNELS_0001)
 292          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );
 293       break;
 294    case 2:
 295       sse2_movupd(p->func, data, arg0);
 296       sse2_cvtpd2ps(p->func, data, data);
 297       if(out_chans == CHANNELS_0001)
 298          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
 299       else if(out_chans > 2)
 300          sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
 301        break;
 302    case 3:
 303       sse2_movupd(p->func, data, arg0);
 304       sse2_cvtpd2ps(p->func, data, data);
 305       sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
 306       if(out_chans > 3)
 307          sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
 308       else
 309          sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
 310       sse_movlhps(p->func, data, tmpXMM);
 311       if(out_chans == CHANNELS_0001)
 312          sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
 313       break;
 314    case 4:
 315       sse2_movupd(p->func, data, arg0);
 316       sse2_cvtpd2ps(p->func, data, data);
 317       sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
 318       sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
 319       sse_movlhps(p->func, data, tmpXMM);
 320       break;
 321    }
 322 }
 323
 324 static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
 325 {
 326    if(x86_target(p->func) != X86_32)
 327       x64_mov64(p->func, dst_gpr, src_gpr);
 328    else
 329    {
 330       /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
 331       if(x86_target_caps(p->func) & X86_SSE2)
 332          sse2_movq(p->func, dst_xmm, src_xmm);
 333       else
 334          sse_movlps(p->func, dst_xmm, src_xmm);
 335    }
 336 }
 337
 338 static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
 339 {
 340    emit_mov64(p, dst_gpr, dst_xmm, src, src);
 341 }
 342
 343 static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
 344 {
 345    emit_mov64(p, dst, dst, src_gpr, src_xmm);
 346 }
 347
 348 static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
 349 {
 350    if(x86_target_caps(p->func) & X86_SSE2)
 351       sse2_movdqu(p->func, dst, src);
 352    else
 353       sse_movups(p->func, dst, src);
 354 }
 355
 356 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 357  * but may or may not be good on older processors
 358  * TODO: may perhaps want to use non-temporal stores here if possible
 359  */
 360 static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
 361 {
 362    struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 363    struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
 364    struct x86_reg dataGPR = p->tmp_EAX;
 365    struct x86_reg dataGPR2 = p->tmp2_EDX;
 366
 367    if(size < 8)
 368    {
 369       switch (size)
 370       {
 371       case 1:
 372          x86_mov8(p->func, dataGPR, src);
 373          x86_mov8(p->func, dst, dataGPR);
 374          break;
 375       case 2:
 376          x86_mov16(p->func, dataGPR, src);
 377          x86_mov16(p->func, dst, dataGPR);
 378          break;
 379       case 3:
 380          x86_mov16(p->func, dataGPR, src);
 381          x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
 382          x86_mov16(p->func, dst, dataGPR);
 383          x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
 384          break;
 385       case 4:
 386          x86_mov(p->func, dataGPR, src);
 387          x86_mov(p->func, dst, dataGPR);
 388          break;
 389       case 6:
 390          x86_mov(p->func, dataGPR, src);
 391          x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
 392          x86_mov(p->func, dst, dataGPR);
 393          x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
 394          break;
 395       }
 396    }
 397    else if(!(x86_target_caps(p->func) & X86_SSE))
 398    {
 399       unsigned i = 0;
 400       assert((size & 3) == 0);
 401       for(i = 0; i < size; i += 4)
 402       {
 403          x86_mov(p->func, dataGPR, x86_make_disp(src, i));
 404          x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
 405       }
 406    }
 407    else
 408    {
 409       switch(size)
 410       {
 411       case 8:
 412          emit_load64(p, dataGPR, dataXMM, src);
 413          emit_store64(p, dst, dataGPR, dataXMM);
 414          break;
 415       case 12:
 416          emit_load64(p, dataGPR2, dataXMM, src);
 417          x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
 418          emit_store64(p, dst, dataGPR2, dataXMM);
 419          x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
 420          break;
 421       case 16:
 422          emit_mov128(p, dataXMM, src);
 423          emit_mov128(p, dst, dataXMM);
 424          break;
 425       case 24:
 426          emit_mov128(p, dataXMM, src);
 427          emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
 428          emit_mov128(p, dst, dataXMM);
 429          emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
 430          break;
 431       case 32:
 432          emit_mov128(p, dataXMM, src);
 433          emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
 434          emit_mov128(p, dst, dataXMM);
 435          emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
 436          break;
 437       default:
 438          assert(0);
 439       }
 440    }
 441 }
 442
 443 static boolean translate_attr_convert( struct translate_sse *p,
 444                                const struct translate_element *a,
 445                                struct x86_reg src,
 446                                struct x86_reg dst)
 447
 448 {
 449    const struct util_format_description* input_desc = util_format_description(a->input_format);
 450    const struct util_format_description* output_desc = util_format_description(a->output_format);
 451    unsigned i;
 452    boolean id_swizzle = TRUE;
 453    unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
 454    unsigned needed_chans = 0;
 455    unsigned imms[2] = {0, 0x3f800000};
 456
 457    if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
 458       return FALSE;
 459
 460    if(input_desc->channel[0].size & 7)
 461       return FALSE;
 462
 463    if(input_desc->colorspace != output_desc->colorspace)
 464       return FALSE;
 465
 466    for(i = 1; i < input_desc->nr_channels; ++i)
 467    {
 468       if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
 469          return FALSE;
 470    }
 471
 472    for(i = 1; i < output_desc->nr_channels; ++i)
 473    {
 474       if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
 475          return FALSE;
 476    }
 477
 478    for(i = 0; i < output_desc->nr_channels; ++i)
 479    {
 480       if(output_desc->swizzle[i] < 4)
 481          swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
 482    }
 483
 484    if((x86_target_caps(p->func) & X86_SSE) && (0
 485          || a->output_format == PIPE_FORMAT_R32_FLOAT
 486          || a->output_format == PIPE_FORMAT_R32G32_FLOAT
 487          || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
 488          || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
 489    {
 490       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 491
 492       for(i = 0; i < output_desc->nr_channels; ++i)
 493       {
 494          if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
 495             swizzle[i] = i;
 496       }
 497
 498       for(i = 0; i < output_desc->nr_channels; ++i)
 499       {
 500          if(swizzle[i] < 4)
 501             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
 502          if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
 503             id_swizzle = FALSE;
 504       }
 505
 506       if(needed_chans > 0)
 507       {
 508          switch(input_desc->channel[0].type)
 509          {
 510          case UTIL_FORMAT_TYPE_UNSIGNED:
 511             if(!(x86_target_caps(p->func) & X86_SSE2))
 512                return FALSE;
 513             emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
 514
 515             /* TODO: add support for SSE4.1 pmovzx */
 516             switch(input_desc->channel[0].size)
 517             {
 518             case 8:
 519                /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
 520                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 521                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 522                break;
 523             case 16:
 524                sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 525                break;
 526             case 32: /* we lose precision here */
 527                sse2_psrld_imm(p->func, dataXMM, 1);
 528                break;
 529             default:
 530                return FALSE;
 531             }
 532             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
 533             if(input_desc->channel[0].normalized)
 534             {
 535                struct x86_reg factor;
 536                switch(input_desc->channel[0].size)
 537                {
 538                case 8:
 539                   factor = get_const(p, CONST_INV_255);
 540                   break;
 541                case 16:
 542                   factor = get_const(p, CONST_INV_65535);
 543                   break;
 544                case 32:
 545                   factor = get_const(p, CONST_INV_2147483647);
 546                   break;
 547                default:
 548                   assert(0);
 549                   factor.disp = 0;
 550                   factor.file = 0;
 551                   factor.idx = 0;
 552                   factor.mod = 0;
 553                   break;
 554                }
 555                sse_mulps(p->func, dataXMM, factor);
 556             }
 557             else if(input_desc->channel[0].size == 32)
 558                sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
 559             break;
 560          case UTIL_FORMAT_TYPE_SIGNED:
 561             if(!(x86_target_caps(p->func) & X86_SSE2))
 562                return FALSE;
 563             emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
 564
 565             /* TODO: add support for SSE4.1 pmovsx */
 566             switch(input_desc->channel[0].size)
 567             {
 568             case 8:
 569                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 570                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 571                sse2_psrad_imm(p->func, dataXMM, 24);
 572                break;
 573             case 16:
 574                sse2_punpcklwd(p->func, dataXMM, dataXMM);
 575                sse2_psrad_imm(p->func, dataXMM, 16);
 576                break;
 577             case 32: /* we lose precision here */
 578                break;
 579             default:
 580                return FALSE;
 581             }
 582             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
 583             if(input_desc->channel[0].normalized)
 584             {
 585                struct x86_reg factor;
 586                switch(input_desc->channel[0].size)
 587                {
 588                case 8:
 589                   factor = get_const(p, CONST_INV_127);
 590                   break;
 591                case 16:
 592                   factor = get_const(p, CONST_INV_32767);
 593                   break;
 594                case 32:
 595                   factor = get_const(p, CONST_INV_2147483647);
 596                   break;
 597                default:
 598                   assert(0);
 599                   factor.disp = 0;
 600                   factor.file = 0;
 601                   factor.idx = 0;
 602                   factor.mod = 0;
 603                   break;
 604                }
 605                sse_mulps(p->func, dataXMM, factor);
 606             }
 607             break;
 608
 609             break;
 610          case UTIL_FORMAT_TYPE_FLOAT:
 611             if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
 612                return FALSE;
 613             if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
 614             {
 615                swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
 616                needed_chans = CHANNELS_0001;
 617             }
 618             switch(input_desc->channel[0].size)
 619             {
 620             case 32:
 621                emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
 622                break;
 623             case 64: /* we lose precision here */
 624                if(!(x86_target_caps(p->func) & X86_SSE2))
 625                   return FALSE;
 626                emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
 627                break;
 628             default:
 629                return FALSE;
 630             }
 631             break;
 632          default:
 633             return FALSE;
 634          }
 635
 636          if(!id_swizzle)
 637             sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
 638       }
 639
 640       if(output_desc->nr_channels >= 4
 641             && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
 642             && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
 643             && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
 644             && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
 645             )
 646          sse_movups(p->func, dst, dataXMM);
 647       else
 648       {
 649          if(output_desc->nr_channels >= 2
 650                && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
 651                && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
 652             sse_movlps(p->func, dst, dataXMM);
 653          else
 654          {
 655             if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
 656                sse_movss(p->func, dst, dataXMM);
 657             else
 658                x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
 659
 660             if(output_desc->nr_channels >= 2)
 661             {
 662                if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
 663                {
 664                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
 665                   sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
 666                }
 667                else
 668                   x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
 669             }
 670          }
 671
 672          if(output_desc->nr_channels >= 3)
 673          {
 674             if(output_desc->nr_channels >= 4
 675                   && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
 676                   && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
 677                sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
 678             else
 679             {
 680                if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
 681                {
 682                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
 683                   sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
 684                }
 685                else
 686                   x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
 687
 688                if(output_desc->nr_channels >= 4)
 689                {
 690                   if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
 691                   {
 692                      sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
 693                      sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
 694                   }
 695                   else
 696                      x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
 697                }
 698             }
 699          }
 700       }
 701       return TRUE;
 702    }
 703    else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
 704          && output_desc->channel[0].normalized == input_desc->channel[0].normalized
 705          && (0
 706                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
 707                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
 708                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
 709                ))
 710    {
 711       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 712       struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
 713       struct x86_reg tmp = p->tmp_EAX;
 714       unsigned imms[2] = {0, 1};
 715
 716       for(i = 0; i < output_desc->nr_channels; ++i)
 717       {
 718          if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
 719             swizzle[i] = i;
 720       }
 721
 722       for(i = 0; i < output_desc->nr_channels; ++i)
 723       {
 724          if(swizzle[i] < 4)
 725             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
 726          if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
 727             id_swizzle = FALSE;
 728       }
 729
 730       if(needed_chans > 0)
 731       {
 732          emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
 733
 734          switch(input_desc->channel[0].type)
 735          {
 736          case UTIL_FORMAT_TYPE_UNSIGNED:
 737             if(input_desc->channel[0].normalized)
 738             {
 739                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 740                if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
 741                        sse2_psrlw_imm(p->func, dataXMM, 1);
 742             }
 743             else
 744                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 745             break;
 746          case UTIL_FORMAT_TYPE_SIGNED:
 747             if(input_desc->channel[0].normalized)
 748             {
 749                sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
 750                sse2_punpcklbw(p->func, tmpXMM, dataXMM);
 751                sse2_psllw_imm(p->func, dataXMM, 9);
 752                sse2_psrlw_imm(p->func, dataXMM, 8);
 753                sse2_por(p->func, tmpXMM, dataXMM);
 754                sse2_psrlw_imm(p->func, dataXMM, 7);
 755                sse2_por(p->func, tmpXMM, dataXMM);
 756                {
 757                   struct x86_reg t = dataXMM;
 758                   dataXMM = tmpXMM;
 759                   tmpXMM = t;
 760                }
 761             }
 762             else
 763             {
 764                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 765                sse2_psraw_imm(p->func, dataXMM, 8);
 766             }
 767             break;
 768          default:
 769             assert(0);
 770          }
 771
 772          if(output_desc->channel[0].normalized)
 773             imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
 774
 775          if(!id_swizzle)
 776             sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
 777       }
 778
 779       if(output_desc->nr_channels >= 4
 780             && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
 781             && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
 782             && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
 783             && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
 784             )
 785          sse2_movq(p->func, dst, dataXMM);
 786       else
 787       {
 788          if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
 789          {
 790             if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
 791                sse2_movd(p->func, dst, dataXMM);
 792             else
 793             {
 794                sse2_movd(p->func, tmp, dataXMM);
 795                x86_mov16(p->func, dst, tmp);
 796                if(output_desc->nr_channels >= 2)
 797                   x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
 798             }
 799          }
 800          else
 801          {
 802             if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
 803                x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
 804             else
 805             {
 806                x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
 807                if(output_desc->nr_channels >= 2)
 808                {
 809                   sse2_movd(p->func, tmp, dataXMM);
 810                   x86_shr_imm(p->func, tmp, 16);
 811                   x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
 812                }
 813             }
 814          }
 815
 816          if(output_desc->nr_channels >= 3)
 817          {
 818             if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
 819             {
 820                if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
 821                {
 822                   sse2_psrlq_imm(p->func, dataXMM, 32);
 823                   sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
 824                }
 825                else
 826                {
 827                   sse2_psrlq_imm(p->func, dataXMM, 32);
 828                   sse2_movd(p->func, tmp, dataXMM);
 829                   x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
 830                   if(output_desc->nr_channels >= 4)
 831                   {
 832                      x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
 833                   }
 834                }
 835             }
 836             else
 837             {
 838                if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
 839                   x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
 840                else
 841                {
 842                   x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
 843
 844                   if(output_desc->nr_channels >= 4)
 845                   {
 846                      sse2_psrlq_imm(p->func, dataXMM, 48);
 847                      sse2_movd(p->func, tmp, dataXMM);
 848                      x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
 849                   }
 850                }
 851             }
 852          }
 853       }
 854       return TRUE;
 855    }
 856    else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
 857    {
 858       struct x86_reg tmp = p->tmp_EAX;
 859       unsigned i;
 860       if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
 861                      && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
 862                      && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
 863                      && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
 864                      && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
 865       {
 866          /* TODO: support movbe */
 867          x86_mov(p->func, tmp, src);
 868          x86_bswap(p->func, tmp);
 869          x86_mov(p->func, dst, tmp);
 870          return TRUE;
 871       }
 872
 873       for(i = 0; i < output_desc->nr_channels; ++i)
 874       {
 875          switch(output_desc->channel[0].size)
 876          {
 877          case 8:
 878             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 879             {
 880                unsigned v = 0;
 881                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 882                {
 883                   switch(output_desc->channel[0].type)
 884                   {
 885                   case UTIL_FORMAT_TYPE_UNSIGNED:
 886                      v = output_desc->channel[0].normalized ? 0xff : 1;
 887                      break;
 888                   case UTIL_FORMAT_TYPE_SIGNED:
 889                      v = output_desc->channel[0].normalized ? 0x7f : 1;
 890                      break;
 891                   default:
 892                      return FALSE;
 893                   }
 894                }
 895                x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
 896             }
 897             else
 898             {
 899                x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
 900                x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
 901             }
 902             break;
 903          case 16:
 904             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 905             {
 906                unsigned v = 0;
 907                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 908                {
 909                   switch(output_desc->channel[1].type)
 910                   {
 911                   case UTIL_FORMAT_TYPE_UNSIGNED:
 912                      v = output_desc->channel[1].normalized ? 0xffff : 1;
 913                      break;
 914                   case UTIL_FORMAT_TYPE_SIGNED:
 915                      v = output_desc->channel[1].normalized ? 0x7fff : 1;
 916                      break;
 917                   case UTIL_FORMAT_TYPE_FLOAT:
 918                      v = 0x3c00;
 919                      break;
 920                   default:
 921                      return FALSE;
 922                   }
 923                }
 924                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
 925             }
 926             else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
 927                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
 928             else
 929             {
 930                x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
 931                x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
 932             }
 933             break;
 934          case 32:
 935             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 936             {
 937                unsigned v = 0;
 938                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 939                {
 940                   switch(output_desc->channel[1].type)
 941                   {
 942                   case UTIL_FORMAT_TYPE_UNSIGNED:
 943                      v = output_desc->channel[1].normalized ? 0xffffffff : 1;
 944                      break;
 945                   case UTIL_FORMAT_TYPE_SIGNED:
 946                      v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
 947                      break;
 948                   case UTIL_FORMAT_TYPE_FLOAT:
 949                      v = 0x3f800000;
 950                      break;
 951                   default:
 952                      return FALSE;
 953                   }
 954                }
 955                x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
 956             }
 957             else
 958             {
 959                x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
 960                x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
 961             }
 962             break;
 963          case 64:
 964             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 965             {
 966                unsigned l = 0;
 967                unsigned h = 0;
 968                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 969                {
 970                   switch(output_desc->channel[1].type)
 971                   {
 972                   case UTIL_FORMAT_TYPE_UNSIGNED:
 973                      h = output_desc->channel[1].normalized ? 0xffffffff : 0;
 974                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
 975                      break;
 976                   case UTIL_FORMAT_TYPE_SIGNED:
 977                      h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
 978                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
 979                      break;
 980                   case UTIL_FORMAT_TYPE_FLOAT:
 981                      h = 0x3ff00000;
 982                      l = 0;
 983                      break;
 984                   default:
 985                      return FALSE;
 986                   }
 987                }
 988                x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
 989                x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
 990             }
 991             else
 992             {
 993                if(x86_target_caps(p->func) & X86_SSE)
 994                {
 995                   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
 996                   emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
 997                   emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
 998                }
 999                else
1000                {
1001                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1002                   x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1003                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
1004                   x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1005                }
1006             }
1007             break;
1008          default:
1009             return FALSE;
1010          }
1011       }
1012       return TRUE;
1013    }
1014    /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1015    else if((x86_target_caps(p->func) & X86_SSE2) &&
1016          a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
1017                || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1018                || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
1019          ))
1020    {
1021       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1022
1023       /* load */
1024       sse_movups(p->func, dataXMM, src);
1025
1026       if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
1027          sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
1028
1029       /* scale by 255.0 */
1030       sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1031
1032       /* pack and emit */
1033       sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1034       sse2_packssdw(p->func, dataXMM, dataXMM);
1035       sse2_packuswb(p->func, dataXMM, dataXMM);
1036       sse2_movd(p->func, dst, dataXMM);
1037
1038       return TRUE;
1039    }
1040
1041    return FALSE;
1042 }
1043
1044 static boolean translate_attr( struct translate_sse *p,
1045                                const struct translate_element *a,
1046                                struct x86_reg src,
1047                                struct x86_reg dst)
1048 {
1049    if(a->input_format == a->output_format)
1050    {
1051       emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1052       return TRUE;
1053    }
1054
1055    return translate_attr_convert(p, a, src, dst);
1056 }
1057
1058 static boolean init_inputs( struct translate_sse *p,
1059                             unsigned index_size )
1060 {
1061    unsigned i;
1062    struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
1063                                               get_offset(p, &p->instance_id));
1064
1065    for (i = 0; i < p->nr_buffer_varients; i++) {
1066       struct translate_buffer_varient *varient = &p->buffer_varient[i];
1067       struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
1068
1069       if (!index_size || varient->instance_divisor) {
1070          struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
1071                                                      get_offset(p, &buffer->stride));
1072          struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
1073                                                      get_offset(p, &varient->ptr));
1074          struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
1075                                                      get_offset(p, &buffer->base_ptr));
1076          struct x86_reg elt = p->idx_ESI;
1077          struct x86_reg tmp_EAX = p->tmp_EAX;
1078
1079          /* Calculate pointer to first attrib:
1080           *   base_ptr + stride * index, where index depends on instance divisor
1081           */
1082          if (varient->instance_divisor) {
1083             /* Our index is instance ID divided by instance divisor.
1084              */
1085             x86_mov(p->func, tmp_EAX, instance_id);
1086
1087             if (varient->instance_divisor != 1) {
1088                struct x86_reg tmp_EDX = p->tmp2_EDX;
1089                struct x86_reg tmp_ECX = p->src_ECX;
1090
1091                /* TODO: Add x86_shr() to rtasm and use it whenever
1092                 *       instance divisor is power of two.
1093                 */
1094
1095                x86_xor(p->func, tmp_EDX, tmp_EDX);
1096                x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
1097                x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
1098             }
1099          } else {
1100             x86_mov(p->func, tmp_EAX, elt);
1101          }
1102
1103          /*
1104           * TODO: Respect translate_buffer::max_index.
1105           */
1106
1107          x86_imul(p->func, tmp_EAX, buf_stride);
1108          x64_rexw(p->func);
1109          x86_add(p->func, tmp_EAX, buf_base_ptr);
1110
1111
1112          /* In the linear case, keep the buffer pointer instead of the
1113           * index number.
1114           */
1115          if (!index_size && p->nr_buffer_varients == 1)
1116          {
1117             x64_rexw(p->func);
1118             x86_mov(p->func, elt, tmp_EAX);
1119          }
1120          else
1121          {
1122             x64_rexw(p->func);
1123             x86_mov(p->func, buf_ptr, tmp_EAX);
1124          }
1125       }
1126    }
1127
1128    return TRUE;
1129 }
1130
1131
1132 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
1133                                       unsigned index_size,
1134                                       unsigned var_idx,
1135                                       struct x86_reg elt )
1136 {
1137    if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1138       return x86_make_disp(p->machine_EDI,
1139                            get_offset(p, &p->instance_id));
1140    }
1141    if (!index_size && p->nr_buffer_varients == 1) {
1142       return p->idx_ESI;
1143    }
1144    else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
1145       struct x86_reg ptr = p->src_ECX;
1146       struct x86_reg buf_ptr =
1147          x86_make_disp(p->machine_EDI,
1148                        get_offset(p, &p->buffer_varient[var_idx].ptr));
1149
1150       x64_rexw(p->func);
1151       x86_mov(p->func, ptr, buf_ptr);
1152       return ptr;
1153    }
1154    else {
1155       struct x86_reg ptr = p->src_ECX;
1156       const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
1157
1158       struct x86_reg buf_stride =
1159          x86_make_disp(p->machine_EDI,
1160                        get_offset(p, &p->buffer[varient->buffer_index].stride));
1161
1162       struct x86_reg buf_base_ptr =
1163          x86_make_disp(p->machine_EDI,
1164                        get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
1165
1166
1167
1168       /* Calculate pointer to current attrib:
1169        */
1170       switch(index_size)
1171       {
1172       case 1:
1173          x86_movzx8(p->func, ptr, elt);
1174          break;
1175       case 2:
1176          x86_movzx16(p->func, ptr, elt);
1177          break;
1178       case 4:
1179          x86_mov(p->func, ptr, elt);
1180          break;
1181       }
1182       x86_imul(p->func, ptr, buf_stride);
1183       x64_rexw(p->func);
1184       x86_add(p->func, ptr, buf_base_ptr);
1185       return ptr;
1186    }
1187 }
1188
1189
1190
1191 static boolean incr_inputs( struct translate_sse *p,
1192                             unsigned index_size )
1193 {
1194    if (!index_size && p->nr_buffer_varients == 1) {
1195       struct x86_reg stride = x86_make_disp(p->machine_EDI,
1196                                             get_offset(p, &p->buffer[0].stride));
1197
1198       if (p->buffer_varient[0].instance_divisor == 0) {
1199          x64_rexw(p->func);
1200          x86_add(p->func, p->idx_ESI, stride);
1201          sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1202       }
1203    }
1204    else if (!index_size) {
1205       unsigned i;
1206
1207       /* Is this worthwhile??
1208        */
1209       for (i = 0; i < p->nr_buffer_varients; i++) {
1210          struct translate_buffer_varient *varient = &p->buffer_varient[i];
1211          struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1212                                                 get_offset(p, &varient->ptr));
1213          struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1214                                                    get_offset(p, &p->buffer[varient->buffer_index].stride));
1215
1216          if (varient->instance_divisor == 0) {
1217             x86_mov(p->func, p->tmp_EAX, buf_stride);
1218             x64_rexw(p->func);
1219             x86_add(p->func, p->tmp_EAX, buf_ptr);
1220             if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1221             x64_rexw(p->func);
1222             x86_mov(p->func, buf_ptr, p->tmp_EAX);
1223          }
1224       }
1225    }
1226    else {
1227       x64_rexw(p->func);
1228       x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1229    }
1230
1231    return TRUE;
1232 }
1233
1234
1235 /* Build run( struct translate *machine,
1236  *            unsigned start,
1237  *            unsigned count,
1238  *            void *output_buffer )
1239  * or
1240  *  run_elts( struct translate *machine,
1241  *            unsigned *elts,
1242  *            unsigned count,
1243  *            void *output_buffer )
1244  *
1245  *  Lots of hardcoding
1246  *
1247  * EAX -- pointer to current output vertex
1248  * ECX -- pointer to current attribute
1249  *
1250  */
1251 static boolean build_vertex_emit( struct translate_sse *p,
1252                                   struct x86_function *func,
1253                                   unsigned index_size )
1254 {
1255    int fixup, label;
1256    unsigned j;
1257
1258    memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1259    memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1260
1261    p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
1262    p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
1263    p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
1264    p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
1265    p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
1266    p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
1267    p->src_ECX     = x86_make_reg(file_REG32, reg_CX);
1268
1269    p->func = func;
1270
1271    x86_init_func(p->func);
1272
1273    if(x86_target(p->func) == X86_64_WIN64_ABI)
1274    {
1275            /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
1276            sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
1277            sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
1278    }
1279
1280    x86_push(p->func, p->outbuf_EBX);
1281    x86_push(p->func, p->count_EBP);
1282
1283 /* on non-Win64 x86-64, these are already in the right registers */
1284    if(x86_target(p->func) != X86_64_STD_ABI)
1285    {
1286       x86_push(p->func, p->machine_EDI);
1287       x86_push(p->func, p->idx_ESI);
1288
1289       x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1290       x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1291    }
1292
1293    x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1294
1295    if(x86_target(p->func) != X86_32)
1296       x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1297    else
1298       x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1299
1300    /* Load instance ID.
1301     */
1302    if (p->use_instancing) {
1303       x86_mov(p->func,
1304               p->tmp_EAX,
1305               x86_fn_arg(p->func, 4));
1306       x86_mov(p->func,
1307               x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1308               p->tmp_EAX);
1309    }
1310
1311    /* Get vertex count, compare to zero
1312     */
1313    x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1314    x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1315    fixup = x86_jcc_forward(p->func, cc_E);
1316
1317    /* always load, needed or not:
1318     */
1319    init_inputs(p, index_size);
1320
1321    /* Note address for loop jump
1322     */
1323    label = x86_get_label(p->func);
1324    {
1325       struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1326       int last_varient = -1;
1327       struct x86_reg vb;
1328
1329       for (j = 0; j < p->translate.key.nr_elements; j++) {
1330          const struct translate_element *a = &p->translate.key.element[j];
1331          unsigned varient = p->element_to_buffer_varient[j];
1332
1333          /* Figure out source pointer address:
1334           */
1335          if (varient != last_varient) {
1336             last_varient = varient;
1337             vb = get_buffer_ptr(p, index_size, varient, elt);
1338          }
1339
1340          if (!translate_attr( p, a,
1341                               x86_make_disp(vb, a->input_offset),
1342                               x86_make_disp(p->outbuf_EBX, a->output_offset)))
1343             return FALSE;
1344       }
1345
1346       /* Next output vertex:
1347        */
1348       x64_rexw(p->func);
1349       x86_lea(p->func,
1350               p->outbuf_EBX,
1351               x86_make_disp(p->outbuf_EBX,
1352                             p->translate.key.output_stride));
1353
1354       /* Incr index
1355        */
1356       incr_inputs( p, index_size );
1357    }
1358
1359    /* decr count, loop if not zero
1360     */
1361    x86_dec(p->func, p->count_EBP);
1362    x86_jcc(p->func, cc_NZ, label);
1363
1364    /* Exit mmx state?
1365     */
1366    if (p->func->need_emms)
1367       mmx_emms(p->func);
1368
1369    /* Land forward jump here:
1370     */
1371    x86_fixup_fwd_jump(p->func, fixup);
1372
1373    /* Pop regs and return
1374     */
1375
1376    if(x86_target(p->func) != X86_64_STD_ABI)
1377    {
1378       x86_pop(p->func, p->idx_ESI);
1379       x86_pop(p->func, p->machine_EDI);
1380    }
1381
1382    x86_pop(p->func, p->count_EBP);
1383    x86_pop(p->func, p->outbuf_EBX);
1384
1385    if(x86_target(p->func) == X86_64_WIN64_ABI)
1386    {
1387            sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1388            sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1389    }
1390    x86_ret(p->func);
1391
1392    return TRUE;
1393 }
1394
1395
1396
1397
1398
1399
1400
1401 static void translate_sse_set_buffer( struct translate *translate,
1402                                 unsigned buf,
1403                                 const void *ptr,
1404                                 unsigned stride,
1405                                 unsigned max_index )
1406 {
1407    struct translate_sse *p = (struct translate_sse *)translate;
1408
1409    if (buf < p->nr_buffers) {
1410       p->buffer[buf].base_ptr = (char *)ptr;
1411       p->buffer[buf].stride = stride;
1412       p->buffer[buf].max_index = max_index;
1413    }
1414
1415    if (0) debug_printf("%s %d/%d: %p %d\n",
1416                        __FUNCTION__, buf,
1417                        p->nr_buffers,
1418                        ptr, stride);
1419 }
1420
1421
1422 static void translate_sse_release( struct translate *translate )
1423 {
1424    struct translate_sse *p = (struct translate_sse *)translate;
1425
1426    x86_release_func( &p->linear_func );
1427    x86_release_func( &p->elt_func );
1428
1429    os_free_aligned(p);
1430 }
1431
1432
1433 struct translate *translate_sse2_create( const struct translate_key *key )
1434 {
1435    struct translate_sse *p = NULL;
1436    unsigned i;
1437
1438    /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1439    if (!rtasm_cpu_has_sse())
1440       goto fail;
1441
1442    p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1443    if (p == NULL)
1444       goto fail;
1445    memset(p, 0, sizeof(*p));
1446    memcpy(p->consts, consts, sizeof(consts));
1447
1448    p->translate.key = *key;
1449    p->translate.release = translate_sse_release;
1450    p->translate.set_buffer = translate_sse_set_buffer;
1451
1452    for (i = 0; i < key->nr_elements; i++) {
1453       if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1454          unsigned j;
1455
1456          p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1457
1458          if (key->element[i].instance_divisor) {
1459             p->use_instancing = TRUE;
1460          }
1461
1462          /*
1463           * Map vertex element to vertex buffer varient.
1464           */
1465          for (j = 0; j < p->nr_buffer_varients; j++) {
1466             if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
1467                 p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
1468                break;
1469             }
1470          }
1471          if (j == p->nr_buffer_varients) {
1472             p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
1473             p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
1474             p->nr_buffer_varients++;
1475          }
1476          p->element_to_buffer_varient[i] = j;
1477       } else {
1478          assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1479
1480          p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
1481       }
1482    }
1483
1484    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
1485
1486    if (!build_vertex_emit(p, &p->linear_func, 0))
1487       goto fail;
1488
1489    if (!build_vertex_emit(p, &p->elt_func, 4))
1490       goto fail;
1491
1492    if (!build_vertex_emit(p, &p->elt16_func, 2))
1493       goto fail;
1494
1495    if (!build_vertex_emit(p, &p->elt8_func, 1))
1496       goto fail;
1497
1498    p->translate.run = (run_func) x86_get_func(&p->linear_func);
1499    if (p->translate.run == NULL)
1500       goto fail;
1501
1502    p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1503    if (p->translate.run_elts == NULL)
1504       goto fail;
1505
1506    p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1507    if (p->translate.run_elts16 == NULL)
1508       goto fail;
1509
1510    p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1511    if (p->translate.run_elts8 == NULL)
1512       goto fail;
1513
1514    return &p->translate;
1515
1516  fail:
1517    if (p)
1518       translate_sse_release( &p->translate );
1519
1520    return NULL;
1521 }
1522
1523
1524
1525 #else
1526
/**
 * Stub used when neither PIPE_ARCH_X86 nor PIPE_ARCH_X86_64 is defined:
 * no code generator is available, so creation always fails.
 *
 * \param key  ignored
 * \return always NULL
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;

   return NULL;
}
1531
1532 #endif