src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c

   1 /*
   2  * (C) Copyright IBM Corporation 2008
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file
  27  * Real-time assembly generation interface for Cell B.E. SPEs.
  28  *
  29  * \author Ian Romanick <idr@us.ibm.com>
  30  * \author Brian Paul
  31  */
  32
  33
  34 #include <stdio.h>
  35 #include "pipe/p_compiler.h"
  36 #include "util/u_memory.h"
  37 #include "rtasm_ppc_spe.h"
  38
  39
  40 #ifdef GALLIUM_CELL
  41 /**
  42  * SPE instruction types
  43  *
  44  * There are 6 primary instruction encodings used on the Cell's SPEs.  Each of
  45  * the following unions encodes one type.
  46  *
  47  * \bug
  48  * If, at some point, we start generating SPE code from a little-endian host
  49  * these unions will not work.
  50  */
  51 /*@{*/
/**
 * RR format: encode one output register with two input registers.
 *
 * The bit-field widths (11 + 7 + 7 + 7) add up to the 32 bits of \c bits.
 * emit_RR() fills in the fields and stores \c bits into the code buffer.
 */
union spe_inst_RR {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:11;     /**< Opcode. */
        unsigned rB:7;      /**< Second input register. */
        unsigned rA:7;      /**< First input register. */
        unsigned rT:7;      /**< Output register. */
    } inst;
};
  64
  65
/**
 * RRR format: encode one output register with three input registers.
 *
 * The bit-field widths (4 + 7 + 7 + 7 + 7) add up to the 32 bits of
 * \c bits.  emit_RRR() fills in the fields and stores \c bits into the
 * code buffer.
 */
union spe_inst_RRR {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:4;      /**< Opcode (only 4 bits wide in this format). */
        unsigned rT:7;      /**< Output register. */
        unsigned rB:7;      /**< Second input register. */
        unsigned rA:7;      /**< First input register. */
        unsigned rC:7;      /**< Third input register. */
    } inst;
};
  79
  80
/**
 * RI7 format: encode one output register with one input register and a
 * 7-bit signed immediate.
 *
 * The bit-field widths (11 + 7 + 7 + 7) add up to the 32 bits of \c bits.
 * emit_RI7() fills in the fields and stores \c bits into the code buffer.
 */
union spe_inst_RI7 {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:11;     /**< Opcode. */
        unsigned i7:7;      /**< Immediate; assignment keeps the low 7 bits. */
        unsigned rA:7;      /**< Input register. */
        unsigned rT:7;      /**< Output register. */
    } inst;
};
  93
  94
/**
 * RI8 format: encode one output register with one input register and an
 * 8-bit signed immediate.
 *
 * The bit-field widths (10 + 8 + 7 + 7) add up to the 32 bits of \c bits.
 * emit_RI8() fills in the fields and stores \c bits into the code buffer.
 */
union spe_inst_RI8 {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:10;     /**< Opcode. */
        unsigned i8:8;      /**< Immediate; assignment keeps the low 8 bits. */
        unsigned rA:7;      /**< Input register. */
        unsigned rT:7;      /**< Output register. */
    } inst;
};
 107
 108
/**
 * RI10 format: encode one output register with one input register and a
 * 10-bit signed immediate.
 *
 * The bit-field widths (8 + 10 + 7 + 7) add up to the 32 bits of \c bits.
 * emit_RI10() fills in the fields and stores \c bits into the code buffer.
 */
union spe_inst_RI10 {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:8;      /**< Opcode. */
        unsigned i10:10;    /**< Immediate; assignment keeps the low 10 bits. */
        unsigned rA:7;      /**< Input register. */
        unsigned rT:7;      /**< Output register. */
    } inst;
};
 121
 122
/**
 * RI16 format: encode one output register with a 16-bit signed immediate.
 *
 * The bit-field widths (9 + 16 + 7) add up to the 32 bits of \c bits.
 * emit_RI16() fills in the fields and stores \c bits into the code buffer.
 */
union spe_inst_RI16 {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:9;      /**< Opcode. */
        unsigned i16:16;    /**< Immediate; assignment keeps the low 16 bits. */
        unsigned rT:7;      /**< Output register. */
    } inst;
};
 134
 135
/**
 * RI18 format: encode one output register with an 18-bit signed immediate.
 *
 * The bit-field widths (7 + 18 + 7) add up to the 32 bits of \c bits.
 * emit_RI18() fills in the fields and stores \c bits into the code buffer.
 */
union spe_inst_RI18 {
    uint32_t bits;          /**< The whole instruction word. */
    struct {
        unsigned op:7;      /**< Opcode. */
        unsigned i18:18;    /**< Immediate; assignment keeps the low 18 bits. */
        unsigned rT:7;      /**< Output register. */
    } inst;
};
 147 /*@}*/
 148
 149
 150 static void
 151 indent(const struct spe_function *p)
 152 {
 153    int i;
 154    for (i = 0; i < p->indent; i++) {
 155       putchar(' ');
 156    }
 157 }
 158
 159
 160 static const char *
 161 rem_prefix(const char *longname)
 162 {
 163    return longname + 4;
 164 }
 165
 166
 167 static const char *
 168 reg_name(int reg)
 169 {
 170    switch (reg) {
 171    case SPE_REG_SP:
 172       return "$sp";
 173    case SPE_REG_RA:
 174       return "$lr";
 175    default:
 176       {
 177          static char buf[10];
 178          sprintf(buf, "$%d", reg);
 179          return buf;
 180       }
 181    }
 182 }
 183
 184
 185 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 186                     unsigned rA, unsigned rB, const char *name)
 187 {
 188     union spe_inst_RR inst;
 189     inst.inst.op = op;
 190     inst.inst.rB = rB;
 191     inst.inst.rA = rA;
 192     inst.inst.rT = rT;
 193     p->store[p->num_inst++] = inst.bits;
 194     assert(p->num_inst <= p->max_inst);
 195     if (p->print) {
 196        indent(p);
 197        printf("%s\t%s, %s, %s\n",
 198               rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
 199     }
 200 }
 201
 202
 203 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
 204                      unsigned rA, unsigned rB, unsigned rC, const char *name)
 205 {
 206     union spe_inst_RRR inst;
 207     inst.inst.op = op;
 208     inst.inst.rT = rT;
 209     inst.inst.rB = rB;
 210     inst.inst.rA = rA;
 211     inst.inst.rC = rC;
 212     p->store[p->num_inst++] = inst.bits;
 213     assert(p->num_inst <= p->max_inst);
 214     if (p->print) {
 215        indent(p);
 216        printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
 217               reg_name(rA), reg_name(rB), reg_name(rC));
 218     }
 219 }
 220
 221
 222 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
 223                      unsigned rA, int imm, const char *name)
 224 {
 225     union spe_inst_RI7 inst;
 226     inst.inst.op = op;
 227     inst.inst.i7 = imm;
 228     inst.inst.rA = rA;
 229     inst.inst.rT = rT;
 230     p->store[p->num_inst++] = inst.bits;
 231     assert(p->num_inst <= p->max_inst);
 232     if (p->print) {
 233        indent(p);
 234        printf("%s\t%s, %s, 0x%x\n",
 235               rem_prefix(name), reg_name(rT), reg_name(rA), imm);
 236     }
 237 }
 238
 239
 240
 241 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
 242                      unsigned rA, int imm, const char *name)
 243 {
 244     union spe_inst_RI8 inst;
 245     inst.inst.op = op;
 246     inst.inst.i8 = imm;
 247     inst.inst.rA = rA;
 248     inst.inst.rT = rT;
 249     p->store[p->num_inst++] = inst.bits;
 250     assert(p->num_inst <= p->max_inst);
 251     if (p->print) {
 252        indent(p);
 253        printf("%s\t%s, %s, 0x%x\n",
 254               rem_prefix(name), reg_name(rT), reg_name(rA), imm);
 255     }
 256 }
 257
 258
 259
 260 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
 261                       unsigned rA, int imm, const char *name)
 262 {
 263     union spe_inst_RI10 inst;
 264     inst.inst.op = op;
 265     inst.inst.i10 = imm;
 266     inst.inst.rA = rA;
 267     inst.inst.rT = rT;
 268     p->store[p->num_inst++] = inst.bits;
 269     assert(p->num_inst <= p->max_inst);
 270     if (p->print) {
 271        indent(p);
 272        if (strcmp(name, "spe_lqd") == 0 ||
 273            strcmp(name, "spe_stqd") == 0) {
 274           printf("%s\t%s, %d(%s)\n",
 275                  rem_prefix(name), reg_name(rT), imm, reg_name(rA));
 276        }
 277        else {
 278           printf("%s\t%s, %s, 0x%x\n",
 279                  rem_prefix(name), reg_name(rT), reg_name(rA), imm);
 280        }
 281     }
 282 }
 283
 284
 285 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 286                       int imm, const char *name)
 287 {
 288     union spe_inst_RI16 inst;
 289     inst.inst.op = op;
 290     inst.inst.i16 = imm;
 291     inst.inst.rT = rT;
 292     p->store[p->num_inst++] = inst.bits;
 293     assert(p->num_inst <= p->max_inst);
 294     if (p->print) {
 295        indent(p);
 296        printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
 297     }
 298 }
 299
 300
 301 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 302                       int imm, const char *name)
 303 {
 304     union spe_inst_RI18 inst;
 305     inst.inst.op = op;
 306     inst.inst.i18 = imm;
 307     inst.inst.rT = rT;
 308     p->store[p->num_inst++] = inst.bits;
 309     assert(p->num_inst <= p->max_inst);
 310     if (p->print) {
 311        indent(p);
 312        printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
 313     }
 314 }
 315
 316
 317
 318
 319 #define EMIT_(_name, _op) \
 320 void _name (struct spe_function *p, unsigned rT) \
 321 { \
 322    emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 323 }
 324
 325 #define EMIT_R(_name, _op) \
 326 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 327 { \
 328    emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 329 }
 330
 331 #define EMIT_RR(_name, _op) \
 332 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 333 { \
 334    emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 335 }
 336
 337 #define EMIT_RRR(_name, _op) \
 338 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 339 { \
 340    emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 341 }
 342
 343 #define EMIT_RI7(_name, _op) \
 344 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 345 { \
 346    emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 347 }
 348
 349 #define EMIT_RI8(_name, _op, bias) \
 350 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 351 { \
 352    emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 353 }
 354
 355 #define EMIT_RI10(_name, _op) \
 356 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 357 { \
 358    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 359 }
 360
 361 #define EMIT_RI16(_name, _op) \
 362 void _name (struct spe_function *p, unsigned rT, int imm) \
 363 { \
 364    emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 365 }
 366
 367 #define EMIT_RI18(_name, _op) \
 368 void _name (struct spe_function *p, unsigned rT, int imm) \
 369 { \
 370    emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 371 }
 372
 373 #define EMIT_I16(_name, _op) \
 374 void _name (struct spe_function *p, int imm) \
 375 { \
 376    emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 377 }
 378
 379 #include "rtasm_ppc_spe.h"
 380
 381
 382 /**
 383  * Initialize an spe_function.
 384  * \param code_size  size of instruction buffer to allocate, in bytes.
 385  */
 386 void spe_init_func(struct spe_function *p, unsigned code_size)
 387 {
 388     register unsigned int i;
 389
 390     p->store = align_malloc(code_size, 16);
 391     p->num_inst = 0;
 392     p->max_inst = code_size / SPE_INST_SIZE;
 393
 394     p->set_count = 0;
 395     memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
 396
 397     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
 398      */
 399     p->regs[0] = p->regs[1] = p->regs[2] = 1;
 400     for (i = 80; i <= 127; i++) {
 401       p->regs[i] = 1;
 402     }
 403
 404     p->print = false;
 405     p->indent = 0;
 406 }
 407
 408
 409 void spe_release_func(struct spe_function *p)
 410 {
 411     assert(p->num_inst <= p->max_inst);
 412     if (p->store != NULL) {
 413         align_free(p->store);
 414     }
 415     p->store = NULL;
 416 }
 417
 418
 419 /** Return current code size in bytes. */
 420 unsigned spe_code_size(const struct spe_function *p)
 421 {
 422    return p->num_inst * SPE_INST_SIZE;
 423 }
 424
 425
 426 /**
 427  * Allocate a SPE register.
 428  * \return register index or -1 if none left.
 429  */
 430 int spe_allocate_available_register(struct spe_function *p)
 431 {
 432    unsigned i;
 433    for (i = 0; i < SPE_NUM_REGS; i++) {
 434       if (p->regs[i] == 0) {
 435          p->regs[i] = 1;
 436          return i;
 437       }
 438    }
 439
 440    return -1;
 441 }
 442
 443
 444 /**
 445  * Mark the given SPE register as "allocated".
 446  */
 447 int spe_allocate_register(struct spe_function *p, int reg)
 448 {
 449    assert(reg < SPE_NUM_REGS);
 450    assert(p->regs[reg] == 0);
 451    p->regs[reg] = 1;
 452    return reg;
 453 }
 454
 455
 456 /**
 457  * Mark the given SPE register as "unallocated".  Note that this should
 458  * only be used on registers allocated in the current register set; an
 459  * assertion will fail if an attempt is made to deallocate a register
 460  * allocated in an earlier register set.
 461  */
 462 void spe_release_register(struct spe_function *p, int reg)
 463 {
 464    assert(reg < SPE_NUM_REGS);
 465    assert(p->regs[reg] == 1);
 466
 467    p->regs[reg] = 0;
 468 }
 469
 470 /**
 471  * Start a new set of registers.  This can be called if
 472  * it will be difficult later to determine exactly what
 473  * registers were actually allocated during a code generation
 474  * sequence, and you really just want to deallocate all of them.
 475  */
 476 void spe_allocate_register_set(struct spe_function *p)
 477 {
 478    register unsigned int i;
 479
 480    /* Keep track of the set count.  If it ever wraps around to 0,
 481     * we're in trouble.
 482     */
 483    p->set_count++;
 484    assert(p->set_count > 0);
 485
 486    /* Increment the allocation count of all registers currently
 487     * allocated.  Then any registers that are allocated in this set
 488     * will be the only ones with a count of 1; they'll all be released
 489     * when the register set is released.
 490     */
 491    for (i = 0; i < SPE_NUM_REGS; i++) {
 492       if (p->regs[i] > 0) p->regs[i]++;
 493    }
 494 }
 495
 496 void spe_release_register_set(struct spe_function *p)
 497 {
 498    unsigned int i;
 499
 500    /* If the set count drops below zero, we're in trouble. */
 501    assert(p->set_count > 0);
 502    p->set_count--;
 503
 504    /* Drop the allocation level of all registers.  Any allocated
 505     * during this register set will drop to 0 and then become
 506     * available.
 507     */
 508    for (i = 0; i < SPE_NUM_REGS; i++) {
 509       if (p->regs[i] > 0) p->regs[i]--;
 510    }
 511 }
 512
 513
 514 void
 515 spe_print_code(struct spe_function *p, boolean enable)
 516 {
 517    p->print = enable;
 518 }
 519
 520
 521 void
 522 spe_indent(struct spe_function *p, int spaces)
 523 {
 524    p->indent += spaces;
 525 }
 526
 527
 528 extern void
 529 spe_comment(struct spe_function *p, int rel_indent, const char *s)
 530 {
 531    if (p->print) {
 532       p->indent += rel_indent;
 533       indent(p);
 534       p->indent -= rel_indent;
 535       printf("# %s\n", s);
 536    }
 537 }
 538
 539
/**
 * For branch instructions:
 * \param d  if 1, disable interrupts if branch is taken
 * \param e  if 1, enable interrupts if branch is taken
 * If d and e are both zero, don't change interrupt status (right?)
 */
 546
 547 /** Branch Indirect to address in rA */
 548 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 549 {
 550    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 551 }
 552
 553 /** Interupt Return */
 554 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 555 {
 556    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 557 }
 558
 559 /** Branch indirect and set link on external data */
 560 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 561                 int e)
 562 {
 563    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 564 }
 565
 566 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 567 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 568                 int e)
 569 {
 570    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 571 }
 572
 573 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 574 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 575 {
 576    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 577 }
 578
 579 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 580 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 581 {
 582    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 583 }
 584
 585 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 586 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 587 {
 588    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 589 }
 590
 591 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 592 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 593 {
 594    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 595 }
 596
 597
 598 /* Hint-for-branch instructions
 599  */
 600 #if 0
 601 hbr;
 602 hbra;
 603 hbrr;
 604 #endif
 605
 606
 607 /* Control instructions
 608  */
 609 #if 0
 610 stop;
 611 EMIT_RR  (spe_stopd, 0x140);
 612 EMIT_    (spe_lnop,  0x001);
 613 EMIT_    (spe_nop,   0x201);
 614 sync;
 615 EMIT_    (spe_dsync, 0x003);
 616 EMIT_R   (spe_mfspr, 0x00c);
 617 EMIT_R   (spe_mtspr, 0x10c);
 618 #endif
 619
 620
 621 /**
 622  ** Helper / "macro" instructions.
 623  ** Use somewhat verbose names as a reminder that these aren't native
 624  ** SPE instructions.
 625  **/
 626
 627
 628 void
 629 spe_load_float(struct spe_function *p, unsigned rT, float x)
 630 {
 631    if (x == 0.0f) {
 632       spe_il(p, rT, 0x0);
 633    }
 634    else if (x == 0.5f) {
 635       spe_ilhu(p, rT, 0x3f00);
 636    }
 637    else if (x == 1.0f) {
 638       spe_ilhu(p, rT, 0x3f80);
 639    }
 640    else if (x == -1.0f) {
 641       spe_ilhu(p, rT, 0xbf80);
 642    }
 643    else {
 644       union {
 645          float f;
 646          unsigned u;
 647       } bits;
 648       bits.f = x;
 649       spe_ilhu(p, rT, bits.u >> 16);
 650       spe_iohl(p, rT, bits.u & 0xffff);
 651    }
 652 }
 653
 654
 655 void
 656 spe_load_int(struct spe_function *p, unsigned rT, int i)
 657 {
 658    if (-32768 <= i && i <= 32767) {
 659       spe_il(p, rT, i);
 660    }
 661    else {
 662       spe_ilhu(p, rT, i >> 16);
 663       if (i & 0xffff)
 664          spe_iohl(p, rT, i & 0xffff);
 665    }
 666 }
 667
 668 void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 669 {
 670    /* If the whole value is in the lower 18 bits, use ila, which
 671     * doesn't sign-extend.  Otherwise, if the two halfwords of
 672     * the constant are identical, use ilh.  Otherwise, if every byte of
 673     * the desired value is 0x00 or 0xff, we can use Form Select Mask for
 674     * Bytes Immediate (fsmbi) to load the value in a single instruction.
 675     * Otherwise, in the general case, we have to use ilhu followed by iohl.
 676     */
 677    if ((ui & 0xfffc0000) == ui) {
 678       spe_ila(p, rT, ui);
 679    }
 680    else if ((ui >> 16) == (ui & 0xffff)) {
 681       spe_ilh(p, rT, ui & 0xffff);
 682    }
 683    else if (
 684       ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
 685       ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
 686       ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
 687       ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
 688    ) {
 689       unsigned int mask = 0;
 690       /* fsmbi duplicates each bit in the given mask eight times,
 691        * using a 16-bit value to initialize a 16-byte quadword.
 692        * Each 4-bit nybble of the mask corresponds to a full word
 693        * of the result; look at the value and figure out the mask
 694        * (replicated for each word in the quadword), and then
 695        * form the "select mask" to get the value.
 696        */
 697       if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
 698       if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
 699       if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
 700       if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
 701       spe_fsmbi(p, rT, mask);
 702    }
 703    else {
 704       /* The general case: this usually uses two instructions, but
 705        * may use only one if the low-order 16 bits of each word are 0.
 706        */
 707       spe_ilhu(p, rT, ui >> 16);
 708       if (ui & 0xffff)
 709          spe_iohl(p, rT, ui & 0xffff);
 710    }
 711 }
 712
 713 /* This function is constructed identically to spe_sor_uint() below.
 714  * Changes to one should be made in the other.
 715  */
 716 void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 717 {
 718    /* If we can, emit a single instruction, either And Byte Immediate
 719     * (which uses the same constant across each byte), And Halfword Immediate
 720     * (which sign-extends a 10-bit immediate to 16 bits and uses that
 721     * across each halfword), or And Word Immediate (which sign-extends
 722     * a 10-bit immediate to 32 bits).
 723     *
 724     * Otherwise, we'll need to use a temporary register.
 725     */
 726    register unsigned int tmp;
 727
 728    /* If the upper 23 bits are all 0s or all 1s, sign extension
 729     * will work and we can use And Word Immediate
 730     */
 731    tmp = ui & 0xfffffe00;
 732    if (tmp == 0xfffffe00 || tmp  == 0) {
 733       spe_andi(p, rT, rA, ui & 0x000003ff);
 734       return;
 735    }
 736
 737    /* If the ui field is symmetric along halfword boundaries and
 738     * the upper 7 bits of each halfword are all 0s or 1s, we
 739     * can use And Halfword Immediate
 740     */
 741    tmp = ui & 0xfe00fe00;
 742    if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
 743       spe_andhi(p, rT, rA, ui & 0x000003ff);
 744       return;
 745    }
 746
 747    /* If the ui field is symmetric in each byte, then we can use
 748     * the And Byte Immediate instruction.
 749     */
 750    tmp = ui & 0x000000ff;
 751    if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
 752       spe_andbi(p, rT, rA, tmp);
 753       return;
 754    }
 755
 756    /* Otherwise, we'll have to use a temporary register. */
 757    unsigned int tmp_reg = spe_allocate_available_register(p);
 758    spe_load_uint(p, tmp_reg, ui);
 759    spe_and(p, rT, rA, tmp_reg);
 760    spe_release_register(p, tmp_reg);
 761 }
 762
 763 /* This function is constructed identically to spe_and_uint() above.
 764  * Changes to one should be made in the other.
 765  */
 766 void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 767 {
 768    /* If we can, emit a single instruction, either Exclusive Or Byte
 769     * Immediate (which uses the same constant across each byte), Exclusive
 770     * Or Halfword Immediate (which sign-extends a 10-bit immediate to
 771     * 16 bits and uses that across each halfword), or Exclusive Or Word
 772     * Immediate (which sign-extends a 10-bit immediate to 32 bits).
 773     *
 774     * Otherwise, we'll need to use a temporary register.
 775     */
 776    register unsigned int tmp;
 777
 778    /* If the upper 23 bits are all 0s or all 1s, sign extension
 779     * will work and we can use Exclusive Or Word Immediate
 780     */
 781    tmp = ui & 0xfffffe00;
 782    if (tmp == 0xfffffe00 || tmp  == 0) {
 783       spe_xori(p, rT, rA, ui & 0x000003ff);
 784       return;
 785    }
 786
 787    /* If the ui field is symmetric along halfword boundaries and
 788     * the upper 7 bits of each halfword are all 0s or 1s, we
 789     * can use Exclusive Or Halfword Immediate
 790     */
 791    tmp = ui & 0xfe00fe00;
 792    if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
 793       spe_xorhi(p, rT, rA, ui & 0x000003ff);
 794       return;
 795    }
 796
 797    /* If the ui field is symmetric in each byte, then we can use
 798     * the Exclusive Or Byte Immediate instruction.
 799     */
 800    tmp = ui & 0x000000ff;
 801    if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
 802       spe_xorbi(p, rT, rA, tmp);
 803       return;
 804    }
 805
 806    /* Otherwise, we'll have to use a temporary register. */
 807    unsigned int tmp_reg = spe_allocate_available_register(p);
 808    spe_load_uint(p, tmp_reg, ui);
 809    spe_xor(p, rT, rA, tmp_reg);
 810    spe_release_register(p, tmp_reg);
 811 }
 812
 813 void
 814 spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 815 {
 816    /* If the comparison value is 9 bits or less, it fits inside a
 817     * Compare Equal Word Immediate instruction.
 818     */
 819    if ((ui & 0x000001ff) == ui) {
 820       spe_ceqi(p, rT, rA, ui);
 821    }
 822    /* Otherwise, we're going to have to load a word first. */
 823    else {
 824       unsigned int tmp_reg = spe_allocate_available_register(p);
 825       spe_load_uint(p, tmp_reg, ui);
 826       spe_ceq(p, rT, rA, tmp_reg);
 827       spe_release_register(p, tmp_reg);
 828    }
 829 }
 830
 831 void
 832 spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 833 {
 834    /* If the comparison value is 10 bits or less, it fits inside a
 835     * Compare Logical Greater Than Word Immediate instruction.
 836     */
 837    if ((ui & 0x000003ff) == ui) {
 838       spe_clgti(p, rT, rA, ui);
 839    }
 840    /* Otherwise, we're going to have to load a word first. */
 841    else {
 842       unsigned int tmp_reg = spe_allocate_available_register(p);
 843       spe_load_uint(p, tmp_reg, ui);
 844       spe_clgt(p, rT, rA, tmp_reg);
 845       spe_release_register(p, tmp_reg);
 846    }
 847 }
 848
 849 void
 850 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 851 {
 852    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
 853    spe_ila(p, rT, 0x00010203);
 854    spe_shufb(p, rT, rA, rA, rT);
 855 }
 856
 857
 858 void
 859 spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 860 {
 861    spe_nor(p, rT, rA, rA);
 862 }
 863
 864
 865 void
 866 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 867 {
 868    /* Use different instructions depending on the instruction address
 869     * to take advantage of the dual pipelines.
 870     */
 871    if (p->num_inst & 1)
 872       spe_shlqbyi(p, rT, rA, 0);  /* odd pipe */
 873    else
 874       spe_ori(p, rT, rA, 0);  /* even pipe */
 875 }
 876
 877
 878 void
 879 spe_zero(struct spe_function *p, unsigned rT)
 880 {
 881    spe_xor(p, rT, rT, rT);
 882 }
 883
 884
 885 void
 886 spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
 887 {
 888    assert(word >= 0);
 889    assert(word <= 3);
 890
 891    if (word == 0) {
 892       int tmp1 = rT;
 893       spe_ila(p, tmp1, 66051);
 894       spe_shufb(p, rT, rA, rA, tmp1);
 895    }
 896    else {
 897       /* XXX review this, we may not need the rotqbyi instruction */
 898       int tmp1 = rT;
 899       int tmp2 = spe_allocate_available_register(p);
 900
 901       spe_ila(p, tmp1, 66051);
 902       spe_rotqbyi(p, tmp2, rA, 4 * word);
 903       spe_shufb(p, rT, tmp2, tmp2, tmp1);
 904
 905       spe_release_register(p, tmp2);
 906    }
 907 }
 908
 909 /**
 910  * For each 32-bit float element of rA and rB, choose the smaller of the
 911  * two, compositing them into the rT register.
 912  *
 913  * The Float Compare Greater Than (fcgt) instruction will put 1s into
 914  * compare_reg where rA > rB, and 0s where rA <= rB.
 915  *
 916  * Then the Select Bits (selb) instruction will take bits from rA where
 917  * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
 918  * where rA <= rB and from rB where rB > rA, which is exactly the
 919  * "min" operation.
 920  *
 921  * The compare_reg could in many cases be the same as rT, unless
 922  * rT == rA || rt == rB.  But since this is common in constructions
 923  * like "x = min(x, a)", we always allocate a new register to be safe.
 924  */
 925 void
 926 spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 927 {
 928    unsigned int compare_reg = spe_allocate_available_register(p);
 929    spe_fcgt(p, compare_reg, rA, rB);
 930    spe_selb(p, rT, rA, rB, compare_reg);
 931    spe_release_register(p, compare_reg);
 932 }
 933
 934 /**
 935  * For each 32-bit float element of rA and rB, choose the greater of the
 936  * two, compositing them into the rT register.
 937  *
 938  * The logic is similar to that of spe_float_min() above; the only
 939  * difference is that the registers on spe_selb() have been reversed,
 940  * so that the larger of the two is selected instead of the smaller.
 941  */
 942 void
 943 spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 944 {
 945    unsigned int compare_reg = spe_allocate_available_register(p);
 946    spe_fcgt(p, compare_reg, rA, rB);
 947    spe_selb(p, rT, rB, rA, compare_reg);
 948    spe_release_register(p, compare_reg);
 949 }
 950
 951 #endif /* GALLIUM_CELL */