5366ab85140b3df9c1ae2459728c23060ebf4c98
[mesa.git] / src / gallium / drivers / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include "pipe/p_shader_tokens.h"
33
34 #include "util/u_memory.h"
35 #include "util/u_math.h"
36
37 #include "tgsi/tgsi_ureg.h"
38 #include "tgsi/tgsi_ureg_parse.h"
39 #include "tgsi/tgsi_dump.h"
40 #include "tgsi/tgsi_info.h"
41
42 #include "brw_context.h"
43 #include "brw_vs.h"
44 #include "brw_debug.h"
45
46
47
48 static struct brw_reg get_tmp( struct brw_vs_compile *c )
49 {
50 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
51
52 if (++c->last_tmp > c->prog_data.total_grf)
53 c->prog_data.total_grf = c->last_tmp;
54
55 return tmp;
56 }
57
58 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
59 {
60 if (tmp.nr == c->last_tmp-1)
61 c->last_tmp--;
62 }
63
64 static void release_tmps( struct brw_vs_compile *c )
65 {
66 c->last_tmp = c->first_tmp;
67 }
68
69
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * GRF layout, in allocation order: r0 (thread header), curbe (user
 * clip planes and, when not using a constant buffer, the constants),
 * vertex inputs, outputs (only position and MRF-overflow outputs live
 * in the GRF), temporaries, address reg(s), constant-buffer staging
 * regs, GRF shadows of outputs used as sources, and the call stack.
 * The area above c->first_tmp is handed out by get_tmp().
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      for (i = 0; i < c->key.nr_userclip; i++) {
         /* Two vec4 planes per GRF, starting 3 regs into the curbe.
          * NOTE(review): the +3 offset presumably matches the curbe
          * layout set up by state emission -- confirm there.
          */
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* get constants from a real constant buffer */
      c->prog_data.curb_read_length = 0;
      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
      for (i = 0; i < nr_params; i++) {
         /* Constants are packed two vec4s per GRF. */
         c->regs[TGSI_FILE_CONSTANT][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = c->vp->info.num_inputs;
   for (i = 0; i < c->nr_inputs; i++) {
      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* The low MRFs carry the vertex header written by
    * emit_vertex_write(); outputs start after it (IGDNG's header is
    * longer).
    */
   if (c->chipset.is_igdng)
      mrf = 8;
   else
      mrf = 4;

   /* XXX: need to access vertex output semantics here:
    */
   c->nr_outputs = c->prog_data.nr_outputs;
   for (i = 0; i < c->prog_data.nr_outputs; i++) {
      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));

      /* XXX: Hardwire position to zero:
       */
      if (i == 0) {
         /* Position stays in a GRF; emit_vertex_write() copies it into
          * the message header itself.
          */
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
      /* XXX: disable psiz:
       */
      else if (0) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
         mrf++;      /* just a placeholder?  XXX fix later stages & remove this */
      }
      else if (mrf < 16) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
         mrf++;
      }
      else {
         /* too many vertex results to fit in MRF, use GRF for overflow */
         if (!c->first_overflow_output)
            c->first_overflow_output = i;
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Allocate program temporaries:
    */

   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
      c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                              reg,
                                              0,
                                              BRW_REGISTER_TYPE_D,
                                              BRW_VERTICAL_STRIDE_8,
                                              BRW_WIDTH_8,
                                              BRW_HORIZONTAL_STRIDE_1,
                                              BRW_SWIZZLE_XXXX,
                                              BRW_WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* One staging reg per possible src argument, used by
       * get_constant() to cache fetched constants.
       */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Outputs that are also read as sources get a GRF shadow copy --
    * presumably because MRF outputs can't be instruction sources;
    * TODO confirm against the opcode emission code.
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Two GRFs of call-stack storage for subroutine returns. */
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
   reg += 2;

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;      /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;

   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (c->chipset.is_igdng)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (BRW_DEBUG & DEBUG_VS) {
      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
      debug_printf("%s NumTemps %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
265
266
267 /**
268 * If an instruction uses a temp reg both as a src and the dest, we
269 * sometimes need to allocate an intermediate temporary.
270 */
271 static void unalias1( struct brw_vs_compile *c,
272 struct brw_reg dst,
273 struct brw_reg arg0,
274 void (*func)( struct brw_vs_compile *,
275 struct brw_reg,
276 struct brw_reg ))
277 {
278 if (dst.file == arg0.file && dst.nr == arg0.nr) {
279 struct brw_compile *p = &c->func;
280 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
281 func(c, tmp, arg0);
282 brw_MOV(p, dst, tmp);
283 release_tmp(c, tmp);
284 }
285 else {
286 func(c, dst, arg0);
287 }
288 }
289
290 /**
291 * \sa unalias2
292 * Checkes if 2-operand instruction needs an intermediate temporary.
293 */
294 static void unalias2( struct brw_vs_compile *c,
295 struct brw_reg dst,
296 struct brw_reg arg0,
297 struct brw_reg arg1,
298 void (*func)( struct brw_vs_compile *,
299 struct brw_reg,
300 struct brw_reg,
301 struct brw_reg ))
302 {
303 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
304 (dst.file == arg1.file && dst.nr == arg1.nr)) {
305 struct brw_compile *p = &c->func;
306 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
307 func(c, tmp, arg0, arg1);
308 brw_MOV(p, dst, tmp);
309 release_tmp(c, tmp);
310 }
311 else {
312 func(c, dst, arg0, arg1);
313 }
314 }
315
316 /**
317 * \sa unalias2
318 * Checkes if 3-operand instruction needs an intermediate temporary.
319 */
320 static void unalias3( struct brw_vs_compile *c,
321 struct brw_reg dst,
322 struct brw_reg arg0,
323 struct brw_reg arg1,
324 struct brw_reg arg2,
325 void (*func)( struct brw_vs_compile *,
326 struct brw_reg,
327 struct brw_reg,
328 struct brw_reg,
329 struct brw_reg ))
330 {
331 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
332 (dst.file == arg1.file && dst.nr == arg1.nr) ||
333 (dst.file == arg2.file && dst.nr == arg2.nr)) {
334 struct brw_compile *p = &c->func;
335 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
336 func(c, tmp, arg0, arg1, arg2);
337 brw_MOV(p, dst, tmp);
338 release_tmp(c, tmp);
339 }
340 else {
341 func(c, dst, arg0, arg1, arg2);
342 }
343 }
344
/**
 * Shared body of the TGSI SET-on-condition opcodes:
 * dst = (arg0 <cond> arg1) ? 1.0 : 0.0, per channel.
 *
 * Sequence: load 0.0 unconditionally, CMP to the null register to set
 * the flag, then a load of 1.0 that only lands in channels passing the
 * comparison.  NOTE(review): relies on the CMP-to-null convention
 * enabling predication for the following instruction -- confirm in
 * brw_CMP's implementation.  The final call restores the flag so later
 * instructions are unpredicated.
 */
static void emit_sop( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
356
357 static void emit_seq( struct brw_compile *p,
358 struct brw_reg dst,
359 struct brw_reg arg0,
360 struct brw_reg arg1 )
361 {
362 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
363 }
364
365 static void emit_sne( struct brw_compile *p,
366 struct brw_reg dst,
367 struct brw_reg arg0,
368 struct brw_reg arg1 )
369 {
370 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
371 }
372 static void emit_slt( struct brw_compile *p,
373 struct brw_reg dst,
374 struct brw_reg arg0,
375 struct brw_reg arg1 )
376 {
377 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
378 }
379
380 static void emit_sle( struct brw_compile *p,
381 struct brw_reg dst,
382 struct brw_reg arg0,
383 struct brw_reg arg1 )
384 {
385 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
386 }
387
388 static void emit_sgt( struct brw_compile *p,
389 struct brw_reg dst,
390 struct brw_reg arg0,
391 struct brw_reg arg1 )
392 {
393 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
394 }
395
396 static void emit_sge( struct brw_compile *p,
397 struct brw_reg dst,
398 struct brw_reg arg0,
399 struct brw_reg arg1 )
400 {
401 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
402 }
403
/**
 * Per-channel maximum: dst = max(arg0, arg1).
 *
 * CMP (arg0 < arg1) sets the flag; the SEL then picks arg1 where the
 * comparison passed, arg0 elsewhere.  Predication is reset afterwards.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
413
/**
 * Per-channel minimum: dst = min(arg0, arg1).
 *
 * Same CMP/SEL idiom as emit_max() with the SEL operands swapped:
 * arg0 is chosen where arg0 < arg1.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
423
424
/**
 * Emit a one-source math-box operation (EXP, LOG, RSQ, INV, ...).
 *
 * \param function   BRW_MATH_FUNCTION_* selector
 * \param precision  BRW_MATH_PRECISION_FULL or _PARTIAL
 */
static void emit_math1( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Route the result through a scratch GRF whenever dst is partially
    * masked or is not a GRF (e.g. an MRF output), then copy over.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,        /* presumably the message register number -- confirm in brw_math() */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
460
461
/**
 * Emit a two-source math-box operation (POW, ...).
 *
 * The second operand is staged into m3 before the math SEND; the
 * writemask/non-GRF kludge mirrors emit_math1().
 */
static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second operand travels in m3 of the math message. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
493
494
/**
 * Emit TGSI EXP (the partial-results opcode):
 *   dst.x = 2^floor(arg0.x)   (built directly from IEEE exponent bits)
 *   dst.y = arg0.x - floor(arg0.x)
 *   dst.z = 2^arg0.x          (via the math box)
 *   dst.w = 1.0
 * Only enabled destination channels are computed.  Caller must ensure
 * dst does not alias arg0 ("noalias").
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, BRW_WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
   }
}
550
551
/**
 * Emit TGSI LOG (the partial-results opcode), via IEEE-754 bit
 * manipulation:
 *   dst.x = unbiased exponent of |arg0.x|  (floor(log2 |x|))
 *   dst.y = mantissa of |arg0.x| forced into [1, 2)
 *   dst.z = log2(|arg0.x|) = dst.x + LOG2(dst.y)
 *   dst.w = 1.0
 * Only enabled destination channels are computed; caller guarantees
 * dst does not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* The bit-twiddling below reads tmp channels it has written, so a
    * scratch reg is required when dst is masked or not a GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
      /* Strip the sign bit, then shift the exponent field down. */
      brw_AND(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      /* Remove the exponent bias. */
      brw_ADD(p,
              brw_writemask(tmp, BRW_WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D),   /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
      /* Keep the mantissa bits and force the exponent to 127 (2^0),
       * yielding a float in [1, 2).
       */
      brw_AND(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, BRW_WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, BRW_WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
641
642
643 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
644 */
645 static void emit_dst_noalias( struct brw_vs_compile *c,
646 struct brw_reg dst,
647 struct brw_reg arg0,
648 struct brw_reg arg1)
649 {
650 struct brw_compile *p = &c->func;
651
652 /* There must be a better way to do this:
653 */
654 if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
655 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
656 if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
657 brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
658 if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
659 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
660 if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
661 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
662 }
663
664
/**
 * Cross product: dst = t x u.
 *
 * Two-instruction idiom using the implicit accumulator: the MUL
 * computes t.yzx * u.zxy (dest is null, so only the accumulator is
 * updated -- NOTE(review): assumes the accumulator is written even
 * with a null destination; confirm in the Gen4 ISA docs), then MAC
 * adds -t.zxy * u.yzx to the accumulated value and writes dst.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,  negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
673
674
/**
 * Emit TGSI LIT (lighting coefficients):
 *   dst = (1, 0, 0, 1) initially; if arg0.x > 0 then
 *   dst.y = arg0.x and dst.z = max(arg0.y, 0)^arg0.w.
 * Caller guarantees dst does not alias arg0.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* tmp.z = max(arg0.y, 0): predicated copy clamps negatives away. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
                 BRW_MATH_FUNCTION_POW,
                 brw_writemask(dst, BRW_WRITEMASK_Z),
                 brw_swizzle1(tmp, 2),
                 brw_swizzle1(arg0, 3),
                 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   /* NOTE(review): called even when no tmp was allocated; harmless
    * because release_tmp() only pops the most recent allocation.
    */
   release_tmp(c, tmp);
}
716
/**
 * Emit TGSI LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * Caller guarantees dst aliases none of the args ("noalias").
 *
 * Uses the implicit accumulator: ADD leaves (1 - arg0) in dst, MUL
 * puts dst * arg2 in the accumulator (null dest), and MAC folds in
 * arg0 * arg1.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
729
730 /** 3 or 4-component vector normalization */
731 static void emit_nrm( struct brw_vs_compile *c,
732 struct brw_reg dst,
733 struct brw_reg arg0,
734 int num_comps)
735 {
736 struct brw_compile *p = &c->func;
737 struct brw_reg tmp = get_tmp(c);
738
739 /* tmp = dot(arg0, arg0) */
740 if (num_comps == 3)
741 brw_DP3(p, tmp, arg0, arg0);
742 else
743 brw_DP4(p, tmp, arg0, arg0);
744
745 /* tmp = 1 / sqrt(tmp) */
746 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
747
748 /* dst = arg0 * tmp */
749 brw_MUL(p, dst, arg0, tmp);
750
751 release_tmp(c, tmp);
752 }
753
754
/**
 * Fetch a constant-buffer element into the per-argument staging reg
 * (c->current_const[argIndex].reg) and return a register reading it
 * with XYZWXYZW replication across both vertex halves.
 *
 * The staging reg is cached across instructions: the dataport read is
 * skipped when the same constant index is already resident.  Relative
 * addressing defeats the cache and issues two oword reads (one per
 * vertex half, since each half may use a different address).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct ureg_instruction *inst,
             GLuint argIndex)
{
   const struct ureg_src src = inst->src[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;
   const GLboolean relAddr = src.Indirect;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src.Index || relAddr) {
      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];

      c->current_const[argIndex].index = src.Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src.Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * src.Index,                /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * src.Index,          /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
824
825
826
827 /* TODO: relative addressing!
828 */
829 static struct brw_reg get_reg( struct brw_vs_compile *c,
830 enum tgsi_file_type file,
831 GLuint index )
832 {
833 switch (file) {
834 case TGSI_FILE_TEMPORARY:
835 case TGSI_FILE_INPUT:
836 case TGSI_FILE_OUTPUT:
837 case TGSI_FILE_CONSTANT:
838 assert(c->regs[file][index].nr != 0);
839 return c->regs[file][index];
840
841 case TGSI_FILE_ADDRESS:
842 assert(index == 0);
843 return c->regs[file][index];
844
845 case TGSI_FILE_NULL: /* undef values */
846 return brw_null_reg();
847
848 default:
849 assert(0);
850 return brw_null_reg();
851 }
852 }
853
854
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Loads the hardware address register a0 from the TGSI address reg and
 * copies the indirectly-addressed GRF data into a fresh temporary, one
 * 4-dword half per vertex (each half may carry a different address).
 * NOTE: the temporary is deliberately not released here -- presumably
 * reclaimed by a later release_tmps(); confirm at the call sites.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* 32 bytes per GRF; emit_arl() pre-scaled the address by 16. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
889
890
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg,
 * resolving indirect addressing and constant-buffer fetches.
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct ureg_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->src[argIndex].File;
   const GLint index = inst->src[argIndex].Index;
   const GLboolean relAddr = inst->src[argIndex].Indirect;

   switch (file) {
   case TGSI_FILE_TEMPORARY:
   case TGSI_FILE_INPUT:
   case TGSI_FILE_OUTPUT:
      if (relAddr) {
         /* reg[0] is the file's base; deref() adds index*16 bytes. */
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case TGSI_FILE_CONSTANT:
      if (c->vp->use_const_buffer) {
         return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
      }
      else {
         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
         return c->regs[TGSI_FILE_CONSTANT][index];
      }
   case TGSI_FILE_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case TGSI_FILE_NULL:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   default:
      assert(0);
      return brw_null_reg();
   }
}
940
941
942 static void emit_arl( struct brw_vs_compile *c,
943 struct brw_reg dst,
944 struct brw_reg arg0 )
945 {
946 struct brw_compile *p = &c->func;
947 struct brw_reg tmp = dst;
948 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
949
950 if (need_tmp)
951 tmp = get_tmp(c);
952
953 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
954 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
955
956 if (need_tmp)
957 release_tmp(c, tmp);
958 }
959
960
961 /**
962 * Return the brw reg for the given instruction's src argument.
963 */
964 static struct brw_reg get_arg( struct brw_vs_compile *c,
965 const struct ureg_instruction *inst,
966 GLuint argIndex )
967 {
968 const struct ureg_src src = inst->src[argIndex];
969 struct brw_reg reg;
970
971 if (src.File == TGSI_FILE_NULL)
972 return brw_null_reg();
973
974 reg = get_src_reg(c, inst, argIndex);
975
976 /* Convert 3-bit swizzle to 2-bit.
977 */
978 reg.dw1.bits.swizzle = BRW_SWIZZLE4(src.SwizzleX,
979 src.SwizzleY,
980 src.SwizzleZ,
981 src.SwizzleW);
982
983 /* Note this is ok for non-swizzle instructions:
984 */
985 reg.negate = src.Negate ? 1 : 0;
986
987 return reg;
988 }
989
990
991 /**
992 * Get brw register for the given program dest register.
993 */
994 static struct brw_reg get_dst( struct brw_vs_compile *c,
995 struct ureg_dst dst )
996 {
997 struct brw_reg reg;
998
999 switch (dst.File) {
1000 case TGSI_FILE_TEMPORARY:
1001 case TGSI_FILE_OUTPUT:
1002 assert(c->regs[dst.File][dst.Index].nr != 0);
1003 reg = c->regs[dst.File][dst.Index];
1004 break;
1005 case TGSI_FILE_ADDRESS:
1006 assert(dst.Index == 0);
1007 reg = c->regs[dst.File][dst.Index];
1008 break;
1009 case TGSI_FILE_NULL:
1010 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1011 reg = brw_null_reg();
1012 break;
1013 default:
1014 assert(0);
1015 reg = brw_null_reg();
1016 }
1017
1018 reg.dw1.bits.writemask = dst.WriteMask;
1019
1020 return reg;
1021 }
1022
1023
1024
1025
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the VUE header (NDC coords plus point size / user-clip /
 * negative-rhw flags in m1), then issues one urb_WRITE for the header
 * and MRF-resident outputs, and a second urb_WRITE for any outputs
 * that overflowed into the GRF.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg m0 = brw_message_reg(0);
   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertext_header = 2;

   if (c->key.copy_edgeflag) {
      /* not implemented for this driver yet */
      assert(0);
      brw_MOV(p,
              get_reg(c, TGSI_FILE_OUTPUT, 0),
              get_reg(c, TGSI_FILE_INPUT, 0));
   }

   /* Build ndc coords */
   ndc = get_tmp(c);
   /* ndc = 1.0 / pos.w */
   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
   /* ndc.xyz = pos * ndc */
   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (c->prog_data.writes_psiz ||
       c->key.nr_userclip ||
       c->chipset.is_965)
   {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.writes_psiz) {
         struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
         /* Scale and mask the point size into header1.w -- presumably
          * an 11-bit field at bit 8 of the VUE header; confirm against
          * the hardware docs.
          */
         brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
      }

      /* One clip-distance sign bit per user plane: set bit i where the
       * vertex is on the negative side of plane i.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (c->chipset.is_965) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);   /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, offset(m0, 2), ndc);

   if (c->chipset.is_igdng) {
      /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
      brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
      /* m4, m5 contain the distances from vertex to the user clip planeXXX.
       * Seems it is useless for us.
       * m6 is used for aligning, so that the remainder of vertex element is
       * reg-aligned.
       */
      brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
      len_vertext_header = 6;
   } else {
      brw_MOV(p, offset(m0, 3), pos);
      len_vertext_header = 2;
   }

   /* Only terminate the thread here if nothing overflowed to the GRF;
    * otherwise the second urb_WRITE below carries the EOT.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
       * at mrf[4] atm...
       */
      GLuint i, mrf = 0;
      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
         /* move from GRF to MRF */
         brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
         mrf++;
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    4,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    mrf+1,          /* msg len */
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    BRW_MAX_MRF-1,  /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1175
1176
1177 /**
1178 * Called after code generation to resolve subroutine calls and the
1179 * END instruction.
1180 * \param end_inst points to brw code for END instruction
1181 * \param last_inst points to last instruction emitted before vertex write
1182 */
1183 static void
1184 post_vs_emit( struct brw_vs_compile *c,
1185 struct brw_instruction *end_inst,
1186 struct brw_instruction *last_inst )
1187 {
1188 GLint offset;
1189
1190 brw_resolve_cals(&c->func);
1191
1192 /* patch up the END code to jump past subroutines, etc */
1193 offset = last_inst - end_inst;
1194 if (offset > 1) {
1195 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1196 } else {
1197 end_inst->header.opcode = BRW_OPCODE_NOP;
1198 }
1199 }
1200
1201 static uint32_t
1202 get_predicate(const struct ureg_instruction *inst)
1203 {
1204 /* XXX: disabling for now
1205 */
1206 #if 0
1207 if (inst->dst.CondMask == COND_TR)
1208 return BRW_PREDICATE_NONE;
1209
1210 /* All of GLSL only produces predicates for COND_NE and one channel per
1211 * vector. Fail badly if someone starts doing something else, as it might
1212 * mean infinite looping or something.
1213 *
1214 * We'd like to support all the condition codes, but our hardware doesn't
1215 * quite match the Mesa IR, which is modeled after the NV extensions. For
1216 * those, the instruction may update the condition codes or not, then any
1217 * later instruction may use one of those condition codes. For gen4, the
1218 * instruction may update the flags register based on one of the condition
1219 * codes output by the instruction, and then further instructions may
1220 * predicate on that. We can probably support this, but it won't
1221 * necessarily be easy.
1222 */
1223 /* assert(inst->dst.CondMask == COND_NE); */
1224
1225 switch (inst->dst.CondSwizzle) {
1226 case SWIZZLE_XXXX:
1227 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1228 case SWIZZLE_YYYY:
1229 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1230 case SWIZZLE_ZZZZ:
1231 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1232 case SWIZZLE_WWWW:
1233 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1234 default:
1235 debug_printf("Unexpected predicate: 0x%08x\n",
1236 inst->dst.CondMask);
1237 return BRW_PREDICATE_NORMAL;
1238 }
1239 #else
1240 return BRW_PREDICATE_NORMAL;
1241 #endif
1242 }
1243
1244 static void emit_insn(struct brw_vs_compile *c,
1245 const struct ureg_instruction *inst)
1246 {
1247 struct brw_compile *p = &c->func;
1248 struct brw_reg args[3], dst;
1249 GLuint i;
1250
1251 #if 0
1252 printf("%d: ", insn);
1253 _mesa_print_instruction(inst);
1254 #endif
1255
1256 /* Get argument regs.
1257 */
1258 for (i = 0; i < 3; i++) {
1259 args[i] = get_arg(c, inst, i);
1260 }
1261
1262 /* Get dest regs. Note that it is possible for a reg to be both
1263 * dst and arg, given the static allocation of registers. So
1264 * care needs to be taken emitting multi-operation instructions.
1265 */
1266 dst = get_dst(c, inst->dst);
1267
1268 if (inst->dst.Saturate) {
1269 debug_printf("Unsupported saturate in vertex shader");
1270 }
1271
1272 switch (inst->opcode) {
1273 case TGSI_OPCODE_ABS:
1274 brw_MOV(p, dst, brw_abs(args[0]));
1275 break;
1276 case TGSI_OPCODE_ADD:
1277 brw_ADD(p, dst, args[0], args[1]);
1278 break;
1279 case TGSI_OPCODE_COS:
1280 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1281 break;
1282 case TGSI_OPCODE_DP3:
1283 brw_DP3(p, dst, args[0], args[1]);
1284 break;
1285 case TGSI_OPCODE_DP4:
1286 brw_DP4(p, dst, args[0], args[1]);
1287 break;
1288 case TGSI_OPCODE_DPH:
1289 brw_DPH(p, dst, args[0], args[1]);
1290 break;
1291 case TGSI_OPCODE_NRM:
1292 emit_nrm(c, dst, args[0], 3);
1293 break;
1294 case TGSI_OPCODE_NRM4:
1295 emit_nrm(c, dst, args[0], 4);
1296 break;
1297 case TGSI_OPCODE_DST:
1298 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1299 break;
1300 case TGSI_OPCODE_EXP:
1301 unalias1(c, dst, args[0], emit_exp_noalias);
1302 break;
1303 case TGSI_OPCODE_EX2:
1304 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1305 break;
1306 case TGSI_OPCODE_ARL:
1307 emit_arl(c, dst, args[0]);
1308 break;
1309 case TGSI_OPCODE_FLR:
1310 brw_RNDD(p, dst, args[0]);
1311 break;
1312 case TGSI_OPCODE_FRC:
1313 brw_FRC(p, dst, args[0]);
1314 break;
1315 case TGSI_OPCODE_LOG:
1316 unalias1(c, dst, args[0], emit_log_noalias);
1317 break;
1318 case TGSI_OPCODE_LG2:
1319 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1320 break;
1321 case TGSI_OPCODE_LIT:
1322 unalias1(c, dst, args[0], emit_lit_noalias);
1323 break;
1324 case TGSI_OPCODE_LRP:
1325 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1326 break;
1327 case TGSI_OPCODE_MAD:
1328 brw_MOV(p, brw_acc_reg(), args[2]);
1329 brw_MAC(p, dst, args[0], args[1]);
1330 break;
1331 case TGSI_OPCODE_MAX:
1332 emit_max(p, dst, args[0], args[1]);
1333 break;
1334 case TGSI_OPCODE_MIN:
1335 emit_min(p, dst, args[0], args[1]);
1336 break;
1337 case TGSI_OPCODE_MOV:
1338 brw_MOV(p, dst, args[0]);
1339 break;
1340 case TGSI_OPCODE_MUL:
1341 brw_MUL(p, dst, args[0], args[1]);
1342 break;
1343 case TGSI_OPCODE_POW:
1344 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1345 break;
1346 case TGSI_OPCODE_RCP:
1347 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1348 break;
1349 case TGSI_OPCODE_RSQ:
1350 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1351 break;
1352 case TGSI_OPCODE_SEQ:
1353 emit_seq(p, dst, args[0], args[1]);
1354 break;
1355 case TGSI_OPCODE_SIN:
1356 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1357 break;
1358 case TGSI_OPCODE_SNE:
1359 emit_sne(p, dst, args[0], args[1]);
1360 break;
1361 case TGSI_OPCODE_SGE:
1362 emit_sge(p, dst, args[0], args[1]);
1363 break;
1364 case TGSI_OPCODE_SGT:
1365 emit_sgt(p, dst, args[0], args[1]);
1366 break;
1367 case TGSI_OPCODE_SLT:
1368 emit_slt(p, dst, args[0], args[1]);
1369 break;
1370 case TGSI_OPCODE_SLE:
1371 emit_sle(p, dst, args[0], args[1]);
1372 break;
1373 case TGSI_OPCODE_SUB:
1374 brw_ADD(p, dst, args[0], negate(args[1]));
1375 break;
1376 case TGSI_OPCODE_TRUNC:
1377 /* round toward zero */
1378 brw_RNDZ(p, dst, args[0]);
1379 break;
1380 case TGSI_OPCODE_XPD:
1381 emit_xpd(p, dst, args[0], args[1]);
1382 break;
1383 case TGSI_OPCODE_IF:
1384 assert(c->if_depth < MAX_IF_DEPTH);
1385 c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
1386 /* Note that brw_IF smashes the predicate_control field. */
1387 c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
1388 c->if_depth++;
1389 break;
1390 case TGSI_OPCODE_ELSE:
1391 c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
1392 break;
1393 case TGSI_OPCODE_ENDIF:
1394 assert(c->if_depth > 0);
1395 brw_ENDIF(p, c->if_inst[--c->if_depth]);
1396 break;
1397 case TGSI_OPCODE_BGNLOOP:
1398 c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1399 break;
1400 case TGSI_OPCODE_BRK:
1401 brw_set_predicate_control(p, get_predicate(inst));
1402 brw_BREAK(p);
1403 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1404 break;
1405 case TGSI_OPCODE_CONT:
1406 brw_set_predicate_control(p, get_predicate(inst));
1407 brw_CONT(p);
1408 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1409 break;
1410 case TGSI_OPCODE_ENDLOOP:
1411 {
1412 struct brw_instruction *inst0, *inst1;
1413 GLuint br = 1;
1414
1415 c->loop_depth--;
1416
1417 if (c->chipset.is_igdng)
1418 br = 2;
1419
1420 inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
1421 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1422 while (inst0 > c->loop_inst[c->loop_depth]) {
1423 inst0--;
1424 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
1425 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1426 inst0->bits3.if_else.pop_count = 0;
1427 }
1428 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
1429 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1430 inst0->bits3.if_else.pop_count = 0;
1431 }
1432 }
1433 }
1434 break;
1435 case TGSI_OPCODE_BRA:
1436 brw_set_predicate_control(p, get_predicate(inst));
1437 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1438 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1439 break;
1440 case TGSI_OPCODE_CAL:
1441 brw_set_access_mode(p, BRW_ALIGN_1);
1442 brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1443 brw_set_access_mode(p, BRW_ALIGN_16);
1444 brw_ADD(p, get_addr_reg(c->stack_index),
1445 get_addr_reg(c->stack_index), brw_imm_d(4));
1446 brw_save_call(p, inst->label, p->nr_insn);
1447 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1448 break;
1449 case TGSI_OPCODE_RET:
1450 brw_ADD(p, get_addr_reg(c->stack_index),
1451 get_addr_reg(c->stack_index), brw_imm_d(-4));
1452 brw_set_access_mode(p, BRW_ALIGN_1);
1453 brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
1454 brw_set_access_mode(p, BRW_ALIGN_16);
1455 break;
1456 case TGSI_OPCODE_END:
1457 c->end_offset = p->nr_insn;
1458 /* this instruction will get patched later to jump past subroutine
1459 * code, etc.
1460 */
1461 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1462 break;
1463 case TGSI_OPCODE_BGNSUB:
1464 brw_save_label(p, p->nr_insn, p->nr_insn);
1465 break;
1466 case TGSI_OPCODE_ENDSUB:
1467 /* no-op */
1468 break;
1469 default:
1470 debug_printf("Unsupported opcode %i (%s) in vertex shader",
1471 inst->opcode,
1472 tgsi_get_opcode_name(inst->opcode));
1473 }
1474
1475 /* Set the predication update on the last instruction of the native
1476 * instruction sequence.
1477 *
1478 * This would be problematic if it was set on a math instruction,
1479 * but that shouldn't be the case with the current GLSL compiler.
1480 */
1481 #if 0
1482 /* XXX: disabled
1483 */
1484 if (inst->CondUpdate) {
1485 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1486
1487 assert(hw_insn->header.destreg__conditionalmod == 0);
1488 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1489 }
1490 #endif
1491
1492 release_tmps(c);
1493 }
1494
1495
1496 /* Emit the vertex program instructions here.
1497 */
1498 void brw_vs_emit(struct brw_vs_compile *c)
1499 {
1500 struct brw_compile *p = &c->func;
1501 struct brw_instruction *end_inst, *last_inst;
1502 struct ureg_parse_context parse;
1503 struct ureg_declaration *decl;
1504 struct ureg_declaration *imm;
1505 struct ureg_declaration *insn;
1506
1507 if (BRW_DEBUG & DEBUG_VS)
1508 tgsi_dump(c->vp->tokens, 0);
1509
1510 c->stack_index = brw_indirect(0, 0);
1511
1512 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1513 brw_set_access_mode(p, BRW_ALIGN_16);
1514
1515 /* Static register allocation
1516 */
1517 brw_vs_alloc_regs(c);
1518 brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
1519
1520 while (ureg_next_decl(&parse, &decl)) {
1521 }
1522
1523 while (ureg_next_immediate(&parse, &imm)) {
1524 }
1525
1526 while (ureg_next_instruction(&parse, &insn)) {
1527 }
1528
1529 end_inst = &p->store[end_offset];
1530 last_inst = &p->store[p->nr_insn];
1531
1532 /* The END instruction will be patched to jump to this code */
1533 emit_vertex_write(c);
1534
1535 post_vs_emit(c, end_inst, last_inst);
1536
1537 if (BRW_DEBUG & DEBUG_VS) {
1538 int i;
1539
1540 debug_printf("vs-native:\n");
1541 for (i = 0; i < p->nr_insn; i++)
1542 brw_disasm(stderr, &p->store[i]);
1543 debug_printf("\n");
1544 }
1545 }