src/mesa/drivers/dri/i965/brw_vs_emit.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keith@tungstengraphics.com>
  30   */
  31
  32
  33 #include "main/macros.h"
  34 #include "shader/program.h"
  35 #include "shader/prog_parameter.h"
  36 #include "shader/prog_print.h"
  37 #include "brw_context.h"
  38 #include "brw_vs.h"
  39
  40
  41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
  42 {
  43    struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
  44
  45    if (++c->last_tmp > c->prog_data.total_grf)
  46       c->prog_data.total_grf = c->last_tmp;
  47
  48    return tmp;
  49 }
  50
  51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
  52 {
  53    if (tmp.nr == c->last_tmp-1)
  54       c->last_tmp--;
  55 }
  56
  57 static void release_tmps( struct brw_vs_compile *c )
  58 {
  59    c->last_tmp = c->first_tmp;
  60 }
  61
  62
  63 /**
  64  * Preallocate GRF register before code emit.
  65  * Do things as simply as possible.  Allocate and populate all regs
  66  * ahead of time.
  67  */
  68 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
  69 {
  70    GLuint i, reg = 0, mrf;
  71
  72    /* Determine whether to use a real constant buffer or use a block
  73     * of GRF registers for constants.  The later is faster but only
  74     * works if everything fits in the GRF.
  75     * XXX this heuristic/check may need some fine tuning...
  76     */
  77    if (c->vp->program.Base.Parameters->NumParameters +
  78        c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
  79       c->vp->use_const_buffer = GL_TRUE;
  80    else
  81       c->vp->use_const_buffer = GL_FALSE;
  82
  83    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
  84
  85    /* r0 -- reserved as usual
  86     */
  87    c->r0 = brw_vec8_grf(reg, 0);
  88    reg++;
  89
  90    /* User clip planes from curbe:
  91     */
  92    if (c->key.nr_userclip) {
  93       for (i = 0; i < c->key.nr_userclip; i++) {
  94          c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
  95       }
  96
  97       /* Deal with curbe alignment:
  98        */
  99       reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
 100    }
 101
 102    /* Vertex program parameters from curbe:
 103     */
 104    if (c->vp->use_const_buffer) {
 105       /* get constants from a real constant buffer */
 106       c->prog_data.curb_read_length = 0;
 107       c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
 108    }
 109    else {
 110       /* use a section of the GRF for constants */
 111       GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
 112       for (i = 0; i < nr_params; i++) {
 113          c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
 114       }
 115       reg += (nr_params + 1) / 2;
 116       c->prog_data.curb_read_length = reg - 1;
 117
 118       c->prog_data.nr_params = nr_params * 4;
 119    }
 120
 121    /* Allocate input regs:
 122     */
 123    c->nr_inputs = 0;
 124    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
 125       if (c->prog_data.inputs_read & (1 << i)) {
 126          c->nr_inputs++;
 127          c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 128          reg++;
 129       }
 130    }
 131
 132    /* Allocate outputs.  The non-position outputs go straight into message regs.
 133     */
 134    c->nr_outputs = 0;
 135    c->first_output = reg;
 136    c->first_overflow_output = 0;
 137
 138    if (BRW_IS_IGDNG(c->func.brw))
 139        mrf = 8;
 140    else
 141        mrf = 4;
 142
 143    for (i = 0; i < VERT_RESULT_MAX; i++) {
 144       if (c->prog_data.outputs_written & (1 << i)) {
 145          c->nr_outputs++;
 146          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 147          if (i == VERT_RESULT_HPOS) {
 148             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 149             reg++;
 150          }
 151          else if (i == VERT_RESULT_PSIZ) {
 152             c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 153             reg++;
 154             mrf++;              /* just a placeholder?  XXX fix later stages & remove this */
 155          }
 156          else {
 157             if (mrf < 16) {
 158                c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
 159                mrf++;
 160             }
 161             else {
 162                /* too many vertex results to fit in MRF, use GRF for overflow */
 163                if (!c->first_overflow_output)
 164                   c->first_overflow_output = i;
 165                c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 166                reg++;
 167             }
 168          }
 169       }
 170    }
 171
 172    /* Allocate program temporaries:
 173     */
 174    for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
 175       c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
 176       reg++;
 177    }
 178
 179    /* Address reg(s).  Don't try to use the internal address reg until
 180     * deref time.
 181     */
 182    for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
 183       c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 184                                              reg,
 185                                              0,
 186                                              BRW_REGISTER_TYPE_D,
 187                                              BRW_VERTICAL_STRIDE_8,
 188                                              BRW_WIDTH_8,
 189                                              BRW_HORIZONTAL_STRIDE_1,
 190                                              BRW_SWIZZLE_XXXX,
 191                                              WRITEMASK_X);
 192       reg++;
 193    }
 194
 195    if (c->vp->use_const_buffer) {
 196       for (i = 0; i < 3; i++) {
 197          c->current_const[i].index = -1;
 198          c->current_const[i].reg = brw_vec8_grf(reg, 0);
 199          reg++;
 200       }
 201    }
 202
 203    for (i = 0; i < 128; i++) {
 204       if (c->output_regs[i].used_in_src) {
 205          c->output_regs[i].reg = brw_vec8_grf(reg, 0);
 206          reg++;
 207       }
 208    }
 209
 210    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
 211    reg += 2;
 212
 213    /* Some opcodes need an internal temporary:
 214     */
 215    c->first_tmp = reg;
 216    c->last_tmp = reg;           /* for allocation purposes */
 217
 218    /* Each input reg holds data from two vertices.  The
 219     * urb_read_length is the number of registers read from *each*
 220     * vertex urb, so is half the amount:
 221     */
 222    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 223
 224    if (BRW_IS_IGDNG(c->func.brw))
 225        c->prog_data.urb_entry_size = (c->nr_outputs + 6 + 3) / 4;
 226    else
 227        c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
 228
 229    c->prog_data.total_grf = reg;
 230
 231    if (INTEL_DEBUG & DEBUG_VS) {
 232       _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
 233       _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
 234       _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
 235    }
 236 }
 237
 238
 239 /**
 240  * If an instruction uses a temp reg both as a src and the dest, we
 241  * sometimes need to allocate an intermediate temporary.
 242  */
 243 static void unalias1( struct brw_vs_compile *c,
 244                       struct brw_reg dst,
 245                       struct brw_reg arg0,
 246                       void (*func)( struct brw_vs_compile *,
 247                                     struct brw_reg,
 248                                     struct brw_reg ))
 249 {
 250    if (dst.file == arg0.file && dst.nr == arg0.nr) {
 251       struct brw_compile *p = &c->func;
 252       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 253       func(c, tmp, arg0);
 254       brw_MOV(p, dst, tmp);
 255       release_tmp(c, tmp);
 256    }
 257    else {
 258       func(c, dst, arg0);
 259    }
 260 }
 261
 262 /**
 263  * \sa unalias2
 264  * Checkes if 2-operand instruction needs an intermediate temporary.
 265  */
 266 static void unalias2( struct brw_vs_compile *c,
 267                       struct brw_reg dst,
 268                       struct brw_reg arg0,
 269                       struct brw_reg arg1,
 270                       void (*func)( struct brw_vs_compile *,
 271                                     struct brw_reg,
 272                                     struct brw_reg,
 273                                     struct brw_reg ))
 274 {
 275    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 276        (dst.file == arg1.file && dst.nr == arg1.nr)) {
 277       struct brw_compile *p = &c->func;
 278       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 279       func(c, tmp, arg0, arg1);
 280       brw_MOV(p, dst, tmp);
 281       release_tmp(c, tmp);
 282    }
 283    else {
 284       func(c, dst, arg0, arg1);
 285    }
 286 }
 287
 288 /**
 289  * \sa unalias2
 290  * Checkes if 3-operand instruction needs an intermediate temporary.
 291  */
 292 static void unalias3( struct brw_vs_compile *c,
 293                       struct brw_reg dst,
 294                       struct brw_reg arg0,
 295                       struct brw_reg arg1,
 296                       struct brw_reg arg2,
 297                       void (*func)( struct brw_vs_compile *,
 298                                     struct brw_reg,
 299                                     struct brw_reg,
 300                                     struct brw_reg,
 301                                     struct brw_reg ))
 302 {
 303    if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
 304        (dst.file == arg1.file && dst.nr == arg1.nr) ||
 305        (dst.file == arg2.file && dst.nr == arg2.nr)) {
 306       struct brw_compile *p = &c->func;
 307       struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
 308       func(c, tmp, arg0, arg1, arg2);
 309       brw_MOV(p, dst, tmp);
 310       release_tmp(c, tmp);
 311    }
 312    else {
 313       func(c, dst, arg0, arg1, arg2);
 314    }
 315 }
 316
 317 static void emit_sop( struct brw_compile *p,
 318                       struct brw_reg dst,
 319                       struct brw_reg arg0,
 320                       struct brw_reg arg1,
 321                       GLuint cond)
 322 {
 323    brw_MOV(p, dst, brw_imm_f(0.0f));
 324    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
 325    brw_MOV(p, dst, brw_imm_f(1.0f));
 326    brw_set_predicate_control_flag_value(p, 0xff);
 327 }
 328
 329 static void emit_seq( struct brw_compile *p,
 330                       struct brw_reg dst,
 331                       struct brw_reg arg0,
 332                       struct brw_reg arg1 )
 333 {
 334    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 335 }
 336
 337 static void emit_sne( struct brw_compile *p,
 338                       struct brw_reg dst,
 339                       struct brw_reg arg0,
 340                       struct brw_reg arg1 )
 341 {
 342    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 343 }
 344 static void emit_slt( struct brw_compile *p,
 345                       struct brw_reg dst,
 346                       struct brw_reg arg0,
 347                       struct brw_reg arg1 )
 348 {
 349    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
 350 }
 351
 352 static void emit_sle( struct brw_compile *p,
 353                       struct brw_reg dst,
 354                       struct brw_reg arg0,
 355                       struct brw_reg arg1 )
 356 {
 357    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 358 }
 359
 360 static void emit_sgt( struct brw_compile *p,
 361                       struct brw_reg dst,
 362                       struct brw_reg arg0,
 363                       struct brw_reg arg1 )
 364 {
 365    emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
 366 }
 367
 368 static void emit_sge( struct brw_compile *p,
 369                       struct brw_reg dst,
 370                       struct brw_reg arg0,
 371                       struct brw_reg arg1 )
 372 {
 373   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 374 }
 375
 376 static void emit_max( struct brw_compile *p,
 377                       struct brw_reg dst,
 378                       struct brw_reg arg0,
 379                       struct brw_reg arg1 )
 380 {
 381    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 382    brw_SEL(p, dst, arg1, arg0);
 383    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 384 }
 385
 386 static void emit_min( struct brw_compile *p,
 387                       struct brw_reg dst,
 388                       struct brw_reg arg0,
 389                       struct brw_reg arg1 )
 390 {
 391    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
 392    brw_SEL(p, dst, arg0, arg1);
 393    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 394 }
 395
 396
 397 static void emit_math1( struct brw_vs_compile *c,
 398                         GLuint function,
 399                         struct brw_reg dst,
 400                         struct brw_reg arg0,
 401                         GLuint precision)
 402 {
 403    /* There are various odd behaviours with SEND on the simulator.  In
 404     * addition there are documented issues with the fact that the GEN4
 405     * processor doesn't do dependency control properly on SEND
 406     * results.  So, on balance, this kludge to get around failures
 407     * with writemasked math results looks like it might be necessary
 408     * whether that turns out to be a simulator bug or not:
 409     */
 410    struct brw_compile *p = &c->func;
 411    struct brw_reg tmp = dst;
 412    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 413                          dst.file != BRW_GENERAL_REGISTER_FILE);
 414
 415    if (need_tmp)
 416       tmp = get_tmp(c);
 417
 418    brw_math(p,
 419             tmp,
 420             function,
 421             BRW_MATH_SATURATE_NONE,
 422             2,
 423             arg0,
 424             BRW_MATH_DATA_SCALAR,
 425             precision);
 426
 427    if (need_tmp) {
 428       brw_MOV(p, dst, tmp);
 429       release_tmp(c, tmp);
 430    }
 431 }
 432
 433
 434 static void emit_math2( struct brw_vs_compile *c,
 435                         GLuint function,
 436                         struct brw_reg dst,
 437                         struct brw_reg arg0,
 438                         struct brw_reg arg1,
 439                         GLuint precision)
 440 {
 441    struct brw_compile *p = &c->func;
 442    struct brw_reg tmp = dst;
 443    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 444                          dst.file != BRW_GENERAL_REGISTER_FILE);
 445
 446    if (need_tmp)
 447       tmp = get_tmp(c);
 448
 449    brw_MOV(p, brw_message_reg(3), arg1);
 450
 451    brw_math(p,
 452             tmp,
 453             function,
 454             BRW_MATH_SATURATE_NONE,
 455             2,
 456             arg0,
 457             BRW_MATH_DATA_SCALAR,
 458             precision);
 459
 460    if (need_tmp) {
 461       brw_MOV(p, dst, tmp);
 462       release_tmp(c, tmp);
 463    }
 464 }
 465
 466
 467 static void emit_exp_noalias( struct brw_vs_compile *c,
 468                               struct brw_reg dst,
 469                               struct brw_reg arg0 )
 470 {
 471    struct brw_compile *p = &c->func;
 472
 473
 474    if (dst.dw1.bits.writemask & WRITEMASK_X) {
 475       struct brw_reg tmp = get_tmp(c);
 476       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 477
 478       /* tmp_d = floor(arg0.x) */
 479       brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
 480
 481       /* result[0] = 2.0 ^ tmp */
 482
 483       /* Adjust exponent for floating point:
 484        * exp += 127
 485        */
 486       brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
 487
 488       /* Install exponent and sign.
 489        * Excess drops off the edge:
 490        */
 491       brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
 492               tmp_d, brw_imm_d(23));
 493
 494       release_tmp(c, tmp);
 495    }
 496
 497    if (dst.dw1.bits.writemask & WRITEMASK_Y) {
 498       /* result[1] = arg0.x - floor(arg0.x) */
 499       brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
 500    }
 501
 502    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 503       /* As with the LOG instruction, we might be better off just
 504        * doing a taylor expansion here, seeing as we have to do all
 505        * the prep work.
 506        *
 507        * If mathbox partial precision is too low, consider also:
 508        * result[3] = result[0] * EXP(result[1])
 509        */
 510       emit_math1(c,
 511                  BRW_MATH_FUNCTION_EXP,
 512                  brw_writemask(dst, WRITEMASK_Z),
 513                  brw_swizzle1(arg0, 0),
 514                  BRW_MATH_PRECISION_FULL);
 515    }
 516
 517    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 518       /* result[3] = 1.0; */
 519       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
 520    }
 521 }
 522
 523
 524 static void emit_log_noalias( struct brw_vs_compile *c,
 525                               struct brw_reg dst,
 526                               struct brw_reg arg0 )
 527 {
 528    struct brw_compile *p = &c->func;
 529    struct brw_reg tmp = dst;
 530    struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 531    struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
 532    GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
 533                          dst.file != BRW_GENERAL_REGISTER_FILE);
 534
 535    if (need_tmp) {
 536       tmp = get_tmp(c);
 537       tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
 538    }
 539
 540    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
 541     * according to spec:
 542     *
 543     * These almost look likey they could be joined up, but not really
 544     * practical:
 545     *
 546     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
 547     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
 548     */
 549    if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
 550       brw_AND(p,
 551               brw_writemask(tmp_ud, WRITEMASK_X),
 552               brw_swizzle1(arg0_ud, 0),
 553               brw_imm_ud((1U<<31)-1));
 554
 555       brw_SHR(p,
 556               brw_writemask(tmp_ud, WRITEMASK_X),
 557               tmp_ud,
 558               brw_imm_ud(23));
 559
 560       brw_ADD(p,
 561               brw_writemask(tmp, WRITEMASK_X),
 562               retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
 563               brw_imm_d(-127));
 564    }
 565
 566    if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
 567       brw_AND(p,
 568               brw_writemask(tmp_ud, WRITEMASK_Y),
 569               brw_swizzle1(arg0_ud, 0),
 570               brw_imm_ud((1<<23)-1));
 571
 572       brw_OR(p,
 573              brw_writemask(tmp_ud, WRITEMASK_Y),
 574              tmp_ud,
 575              brw_imm_ud(127<<23));
 576    }
 577
 578    if (dst.dw1.bits.writemask & WRITEMASK_Z) {
 579       /* result[2] = result[0] + LOG2(result[1]); */
 580
 581       /* Why bother?  The above is just a hint how to do this with a
 582        * taylor series.  Maybe we *should* use a taylor series as by
 583        * the time all the above has been done it's almost certainly
 584        * quicker than calling the mathbox, even with low precision.
 585        *
 586        * Options are:
 587        *    - result[0] + mathbox.LOG2(result[1])
 588        *    - mathbox.LOG2(arg0.x)
 589        *    - result[0] + inline_taylor_approx(result[1])
 590        */
 591       emit_math1(c,
 592                  BRW_MATH_FUNCTION_LOG,
 593                  brw_writemask(tmp, WRITEMASK_Z),
 594                  brw_swizzle1(tmp, 1),
 595                  BRW_MATH_PRECISION_FULL);
 596
 597       brw_ADD(p,
 598               brw_writemask(tmp, WRITEMASK_Z),
 599               brw_swizzle1(tmp, 2),
 600               brw_swizzle1(tmp, 0));
 601    }
 602
 603    if (dst.dw1.bits.writemask & WRITEMASK_W) {
 604       /* result[3] = 1.0; */
 605       brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
 606    }
 607
 608    if (need_tmp) {
 609       brw_MOV(p, dst, tmp);
 610       release_tmp(c, tmp);
 611    }
 612 }
 613
 614
 615 /* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 616  */
 617 static void emit_dst_noalias( struct brw_vs_compile *c,
 618                               struct brw_reg dst,
 619                               struct brw_reg arg0,
 620                               struct brw_reg arg1)
 621 {
 622    struct brw_compile *p = &c->func;
 623
 624    /* There must be a better way to do this:
 625     */
 626    if (dst.dw1.bits.writemask & WRITEMASK_X)
 627       brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
 628    if (dst.dw1.bits.writemask & WRITEMASK_Y)
 629       brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
 630    if (dst.dw1.bits.writemask & WRITEMASK_Z)
 631       brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
 632    if (dst.dw1.bits.writemask & WRITEMASK_W)
 633       brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
 634 }
 635
 636
 637 static void emit_xpd( struct brw_compile *p,
 638                       struct brw_reg dst,
 639                       struct brw_reg t,
 640                       struct brw_reg u)
 641 {
 642    brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
 643    brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
 644 }
 645
 646
 647 static void emit_lit_noalias( struct brw_vs_compile *c,
 648                               struct brw_reg dst,
 649                               struct brw_reg arg0 )
 650 {
 651    struct brw_compile *p = &c->func;
 652    struct brw_instruction *if_insn;
 653    struct brw_reg tmp = dst;
 654    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 655
 656    if (need_tmp)
 657       tmp = get_tmp(c);
 658
 659    brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
 660    brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
 661
 662    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
 663     * to get all channels active inside the IF.  In the clipping code
 664     * we run with NoMask, so it's not an option and we can use
 665     * BRW_EXECUTE_1 for all comparisions.
 666     */
 667    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
 668    if_insn = brw_IF(p, BRW_EXECUTE_8);
 669    {
 670       brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
 671
 672       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
 673       brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
 674       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 675
 676       emit_math2(c,
 677                  BRW_MATH_FUNCTION_POW,
 678                  brw_writemask(dst, WRITEMASK_Z),
 679                  brw_swizzle1(tmp, 2),
 680                  brw_swizzle1(arg0, 3),
 681                  BRW_MATH_PRECISION_PARTIAL);
 682    }
 683
 684    brw_ENDIF(p, if_insn);
 685
 686    release_tmp(c, tmp);
 687 }
 688
 689 static void emit_lrp_noalias(struct brw_vs_compile *c,
 690                              struct brw_reg dst,
 691                              struct brw_reg arg0,
 692                              struct brw_reg arg1,
 693                              struct brw_reg arg2)
 694 {
 695    struct brw_compile *p = &c->func;
 696
 697    brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
 698    brw_MUL(p, brw_null_reg(), dst, arg2);
 699    brw_MAC(p, dst, arg0, arg1);
 700 }
 701
 702 /** 3 or 4-component vector normalization */
 703 static void emit_nrm( struct brw_vs_compile *c,
 704                       struct brw_reg dst,
 705                       struct brw_reg arg0,
 706                       int num_comps)
 707 {
 708    struct brw_compile *p = &c->func;
 709    struct brw_reg tmp = get_tmp(c);
 710
 711    /* tmp = dot(arg0, arg0) */
 712    if (num_comps == 3)
 713       brw_DP3(p, tmp, arg0, arg0);
 714    else
 715       brw_DP4(p, tmp, arg0, arg0);
 716
 717    /* tmp = 1 / sqrt(tmp) */
 718    emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
 719
 720    /* dst = arg0 * tmp */
 721    brw_MUL(p, dst, arg0, tmp);
 722
 723    release_tmp(c, tmp);
 724 }
 725
 726
 727 static struct brw_reg
 728 get_constant(struct brw_vs_compile *c,
 729              const struct prog_instruction *inst,
 730              GLuint argIndex)
 731 {
 732    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 733    struct brw_compile *p = &c->func;
 734    struct brw_reg const_reg;
 735    struct brw_reg const2_reg;
 736    const GLboolean relAddr = src->RelAddr;
 737
 738    assert(argIndex < 3);
 739
 740    if (c->current_const[argIndex].index != src->Index || relAddr) {
 741       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 742
 743       c->current_const[argIndex].index = src->Index;
 744
 745 #if 0
 746       printf("  fetch const[%d] for arg %d into reg %d\n",
 747              src->Index, argIndex, c->current_const[argIndex].reg.nr);
 748 #endif
 749       /* need to fetch the constant now */
 750       brw_dp_READ_4_vs(p,
 751                        c->current_const[argIndex].reg,/* writeback dest */
 752                        0,                             /* oword */
 753                        relAddr,                       /* relative indexing? */
 754                        addrReg,                       /* address register */
 755                        16 * src->Index,               /* byte offset */
 756                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
 757                        );
 758
 759       if (relAddr) {
 760          /* second read */
 761          const2_reg = get_tmp(c);
 762
 763          /* use upper half of address reg for second read */
 764          addrReg = stride(addrReg, 0, 4, 0);
 765          addrReg.subnr = 16;
 766
 767          brw_dp_READ_4_vs(p,
 768                           const2_reg,              /* writeback dest */
 769                           1,                       /* oword */
 770                           relAddr,                 /* relative indexing? */
 771                           addrReg,                 /* address register */
 772                           16 * src->Index,         /* byte offset */
 773                           SURF_INDEX_VERT_CONST_BUFFER
 774                           );
 775       }
 776    }
 777
 778    const_reg = c->current_const[argIndex].reg;
 779
 780    if (relAddr) {
 781       /* merge the two Owords into the constant register */
 782       /* const_reg[7..4] = const2_reg[7..4] */
 783       brw_MOV(p,
 784               suboffset(stride(const_reg, 0, 4, 1), 4),
 785               suboffset(stride(const2_reg, 0, 4, 1), 4));
 786       release_tmp(c, const2_reg);
 787    }
 788    else {
 789       /* replicate lower four floats into upper half (to get XYZWXYZW) */
 790       const_reg = stride(const_reg, 0, 4, 0);
 791       const_reg.subnr = 0;
 792    }
 793
 794    return const_reg;
 795 }
 796
 797
 798
 799 /* TODO: relative addressing!
 800  */
 801 static struct brw_reg get_reg( struct brw_vs_compile *c,
 802                                gl_register_file file,
 803                                GLuint index )
 804 {
 805    switch (file) {
 806    case PROGRAM_TEMPORARY:
 807    case PROGRAM_INPUT:
 808    case PROGRAM_OUTPUT:
 809       assert(c->regs[file][index].nr != 0);
 810       return c->regs[file][index];
 811    case PROGRAM_STATE_VAR:
 812    case PROGRAM_CONSTANT:
 813    case PROGRAM_UNIFORM:
 814       assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 815       return c->regs[PROGRAM_STATE_VAR][index];
 816    case PROGRAM_ADDRESS:
 817       assert(index == 0);
 818       return c->regs[file][index];
 819
 820    case PROGRAM_UNDEFINED:                      /* undef values */
 821       return brw_null_reg();
 822
 823    case PROGRAM_LOCAL_PARAM:
 824    case PROGRAM_ENV_PARAM:
 825    case PROGRAM_WRITE_ONLY:
 826    default:
 827       assert(0);
 828       return brw_null_reg();
 829    }
 830 }
 831
 832
 833 /**
 834  * Indirect addressing:  get reg[[arg] + offset].
 835  */
 836 static struct brw_reg deref( struct brw_vs_compile *c,
 837                              struct brw_reg arg,
 838                              GLint offset)
 839 {
 840    struct brw_compile *p = &c->func;
 841    struct brw_reg tmp = vec4(get_tmp(c));
 842    struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
 843    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
 844    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
 845    struct brw_reg indirect = brw_vec4_indirect(0,0);
 846
 847    {
 848       brw_push_insn_state(p);
 849       brw_set_access_mode(p, BRW_ALIGN_1);
 850
 851       /* This is pretty clunky - load the address register twice and
 852        * fetch each 4-dword value in turn.  There must be a way to do
 853        * this in a single pass, but I couldn't get it to work.
 854        */
 855       brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
 856       brw_MOV(p, tmp, indirect);
 857
 858       brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
 859       brw_MOV(p, suboffset(tmp, 4), indirect);
 860
 861       brw_pop_insn_state(p);
 862    }
 863
 864    /* NOTE: tmp not released */
 865    return vec8(tmp);
 866 }
 867
 868
 869 /**
 870  * Get brw reg corresponding to the instruction's [argIndex] src reg.
 871  * TODO: relative addressing!
 872  */
 873 static struct brw_reg
 874 get_src_reg( struct brw_vs_compile *c,
 875              const struct prog_instruction *inst,
 876              GLuint argIndex )
 877 {
 878    const GLuint file = inst->SrcReg[argIndex].File;
 879    const GLint index = inst->SrcReg[argIndex].Index;
 880    const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
 881
 882    switch (file) {
 883    case PROGRAM_TEMPORARY:
 884    case PROGRAM_INPUT:
 885    case PROGRAM_OUTPUT:
 886       if (relAddr) {
 887          return deref(c, c->regs[file][0], index);
 888       }
 889       else {
 890          assert(c->regs[file][index].nr != 0);
 891          return c->regs[file][index];
 892       }
 893
 894    case PROGRAM_STATE_VAR:
 895    case PROGRAM_CONSTANT:
 896    case PROGRAM_UNIFORM:
 897       if (c->vp->use_const_buffer) {
 898          return get_constant(c, inst, argIndex);
 899       }
 900       else if (relAddr) {
 901          return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
 902       }
 903       else {
 904          assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
 905          return c->regs[PROGRAM_STATE_VAR][index];
 906       }
 907    case PROGRAM_ADDRESS:
 908       assert(index == 0);
 909       return c->regs[file][index];
 910
 911    case PROGRAM_UNDEFINED:
 912       /* this is a normal case since we loop over all three src args */
 913       return brw_null_reg();
 914
 915    case PROGRAM_LOCAL_PARAM:
 916    case PROGRAM_ENV_PARAM:
 917    case PROGRAM_WRITE_ONLY:
 918    default:
 919       assert(0);
 920       return brw_null_reg();
 921    }
 922 }
 923
 924
 925 static void emit_arl( struct brw_vs_compile *c,
 926                       struct brw_reg dst,
 927                       struct brw_reg arg0 )
 928 {
 929    struct brw_compile *p = &c->func;
 930    struct brw_reg tmp = dst;
 931    GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
 932
 933    if (need_tmp)
 934       tmp = get_tmp(c);
 935
 936    brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
 937    brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
 938
 939    if (need_tmp)
 940       release_tmp(c, tmp);
 941 }
 942
 943
 944 /**
 945  * Return the brw reg for the given instruction's src argument.
 946  * Will return mangled results for SWZ op.  The emit_swz() function
 947  * ignores this result and recalculates taking extended swizzles into
 948  * account.
 949  */
 950 static struct brw_reg get_arg( struct brw_vs_compile *c,
 951                                const struct prog_instruction *inst,
 952                                GLuint argIndex )
 953 {
 954    const struct prog_src_register *src = &inst->SrcReg[argIndex];
 955    struct brw_reg reg;
 956
 957    if (src->File == PROGRAM_UNDEFINED)
 958       return brw_null_reg();
 959
 960    reg = get_src_reg(c, inst, argIndex);
 961
 962    /* Convert 3-bit swizzle to 2-bit.
 963     */
 964    reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
 965                                        GET_SWZ(src->Swizzle, 1),
 966                                        GET_SWZ(src->Swizzle, 2),
 967                                        GET_SWZ(src->Swizzle, 3));
 968
 969    /* Note this is ok for non-swizzle instructions:
 970     */
 971    reg.negate = src->Negate ? 1 : 0;
 972
 973    return reg;
 974 }
 975
 976
 977 /**
 978  * Get brw register for the given program dest register.
 979  */
 980 static struct brw_reg get_dst( struct brw_vs_compile *c,
 981                                struct prog_dst_register dst )
 982 {
 983    struct brw_reg reg;
 984
 985    switch (dst.File) {
 986    case PROGRAM_TEMPORARY:
 987    case PROGRAM_OUTPUT:
 988       assert(c->regs[dst.File][dst.Index].nr != 0);
 989       reg = c->regs[dst.File][dst.Index];
 990       break;
 991    case PROGRAM_ADDRESS:
 992       assert(dst.Index == 0);
 993       reg = c->regs[dst.File][dst.Index];
 994       break;
 995    case PROGRAM_UNDEFINED:
 996       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
 997       reg = brw_null_reg();
 998       break;
 999    default:
1000       assert(0);
1001       reg = brw_null_reg();
1002    }
1003
1004    reg.dw1.bits.writemask = dst.WriteMask;
1005
1006    return reg;
1007 }
1008
1009
1010 static void emit_swz( struct brw_vs_compile *c,
1011                       struct brw_reg dst,
1012                       const struct prog_instruction *inst)
1013 {
1014    const GLuint argIndex = 0;
1015    const struct prog_src_register src = inst->SrcReg[argIndex];
1016    struct brw_compile *p = &c->func;
1017    GLuint zeros_mask = 0;
1018    GLuint ones_mask = 0;
1019    GLuint src_mask = 0;
1020    GLubyte src_swz[4];
1021    GLboolean need_tmp = (src.Negate &&
1022                          dst.file != BRW_GENERAL_REGISTER_FILE);
1023    struct brw_reg tmp = dst;
1024    GLuint i;
1025
1026    if (need_tmp)
1027       tmp = get_tmp(c);
1028
1029    for (i = 0; i < 4; i++) {
1030       if (dst.dw1.bits.writemask & (1<<i)) {
1031          GLubyte s = GET_SWZ(src.Swizzle, i);
1032          switch (s) {
1033          case SWIZZLE_X:
1034          case SWIZZLE_Y:
1035          case SWIZZLE_Z:
1036          case SWIZZLE_W:
1037             src_mask |= 1<<i;
1038             src_swz[i] = s;
1039             break;
1040          case SWIZZLE_ZERO:
1041             zeros_mask |= 1<<i;
1042             break;
1043          case SWIZZLE_ONE:
1044             ones_mask |= 1<<i;
1045             break;
1046          }
1047       }
1048    }
1049
1050    /* Do src first, in case dst aliases src:
1051     */
1052    if (src_mask) {
1053       struct brw_reg arg0;
1054
1055       arg0 = get_src_reg(c, inst, argIndex);
1056
1057       arg0 = brw_swizzle(arg0,
1058                          src_swz[0], src_swz[1],
1059                          src_swz[2], src_swz[3]);
1060
1061       brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1062    }
1063
1064    if (zeros_mask)
1065       brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1066
1067    if (ones_mask)
1068       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1069
1070    if (src.Negate)
1071       brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1072
1073    if (need_tmp) {
1074       brw_MOV(p, dst, tmp);
1075       release_tmp(c, tmp);
1076    }
1077 }
1078
1079
1080 /**
1081  * Post-vertex-program processing.  Send the results to the URB.
1082  */
1083 static void emit_vertex_write( struct brw_vs_compile *c)
1084 {
1085    struct brw_compile *p = &c->func;
1086    struct brw_reg m0 = brw_message_reg(0);
1087    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1088    struct brw_reg ndc;
1089    int eot;
1090    GLuint len_vertext_header = 2;
1091
1092    if (c->key.copy_edgeflag) {
1093       brw_MOV(p,
1094               get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1095               get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1096    }
1097
1098    /* Build ndc coords */
1099    ndc = get_tmp(c);
1100    /* ndc = 1.0 / pos.w */
1101    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1102    /* ndc.xyz = pos * ndc */
1103    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1104
1105    /* Update the header for point size, user clipping flags, and -ve rhw
1106     * workaround.
1107     */
1108    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
1109        c->key.nr_userclip || BRW_IS_965(p->brw))
1110    {
1111       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1112       GLuint i;
1113
1114       brw_MOV(p, header1, brw_imm_ud(0));
1115
1116       brw_set_access_mode(p, BRW_ALIGN_16);
1117
1118       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
1119          struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1120          brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1121          brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1122       }
1123
1124       for (i = 0; i < c->key.nr_userclip; i++) {
1125          brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1126          brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1127          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1128          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1129       }
1130
1131       /* i965 clipping workaround:
1132        * 1) Test for -ve rhw
1133        * 2) If set,
1134        *      set ndc = (0,0,0,0)
1135        *      set ucp[6] = 1
1136        *
1137        * Later, clipping will detect ucp[6] and ensure the primitive is
1138        * clipped against all fixed planes.
1139        */
1140       if (BRW_IS_965(p->brw)) {
1141          brw_CMP(p,
1142                  vec8(brw_null_reg()),
1143                  BRW_CONDITIONAL_L,
1144                  brw_swizzle1(ndc, 3),
1145                  brw_imm_f(0));
1146
1147          brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1148          brw_MOV(p, ndc, brw_imm_f(0));
1149          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1150       }
1151
1152       brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
1153       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1154       brw_set_access_mode(p, BRW_ALIGN_16);
1155
1156       release_tmp(c, header1);
1157    }
1158    else {
1159       brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1160    }
1161
1162    /* Emit the (interleaved) headers for the two vertices - an 8-reg
1163     * of zeros followed by two sets of NDC coordinates:
1164     */
1165    brw_set_access_mode(p, BRW_ALIGN_1);
1166    brw_MOV(p, offset(m0, 2), ndc);
1167
1168    if (BRW_IS_IGDNG(p->brw)) {
1169        /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1170        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1171        /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1172         * Seems it is useless for us.
1173         * m6 is used for aligning, so that the remainder of vertex element is
1174         * reg-aligned.
1175         */
1176        brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1177        len_vertext_header = 6;
1178    } else {
1179        brw_MOV(p, offset(m0, 3), pos);
1180        len_vertext_header = 2;
1181    }
1182
1183    eot = (c->first_overflow_output == 0);
1184
1185    brw_urb_WRITE(p,
1186                  brw_null_reg(), /* dest */
1187                  0,             /* starting mrf reg nr */
1188                  c->r0,         /* src */
1189                  0,             /* allocate */
1190                  1,             /* used */
1191                  MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1192                  0,             /* response len */
1193                  eot,           /* eot */
1194                  1,             /* writes complete */
1195                  0,             /* urb destination offset */
1196                  BRW_URB_SWIZZLE_INTERLEAVE);
1197
1198    if (c->first_overflow_output > 0) {
1199       /* Not all of the vertex outputs/results fit into the MRF.
1200        * Move the overflowed attributes from the GRF to the MRF and
1201        * issue another brw_urb_WRITE().
1202        */
1203       /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
1204        * at mrf[4] atm...
1205        */
1206       GLuint i, mrf = 0;
1207       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1208          if (c->prog_data.outputs_written & (1 << i)) {
1209             /* move from GRF to MRF */
1210             brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1211             mrf++;
1212          }
1213       }
1214
1215       brw_urb_WRITE(p,
1216                     brw_null_reg(), /* dest */
1217                     4,              /* starting mrf reg nr */
1218                     c->r0,          /* src */
1219                     0,              /* allocate */
1220                     1,              /* used */
1221                     mrf+1,          /* msg len */
1222                     0,              /* response len */
1223                     1,              /* eot */
1224                     1,              /* writes complete */
1225                     BRW_MAX_MRF-1,  /* urb destination offset */
1226                     BRW_URB_SWIZZLE_INTERLEAVE);
1227    }
1228 }
1229
1230
1231 /**
1232  * Called after code generation to resolve subroutine calls and the
1233  * END instruction.
1234  * \param end_inst  points to brw code for END instruction
1235  * \param last_inst  points to last instruction emitted before vertex write
1236  */
1237 static void
1238 post_vs_emit( struct brw_vs_compile *c,
1239               struct brw_instruction *end_inst,
1240               struct brw_instruction *last_inst )
1241 {
1242    GLint offset;
1243
1244    brw_resolve_cals(&c->func);
1245
1246    /* patch up the END code to jump past subroutines, etc */
1247    offset = last_inst - end_inst;
1248    brw_set_src1(end_inst, brw_imm_d(offset * 16));
1249 }
1250
1251
1252 /* Emit the vertex program instructions here.
1253  */
1254 void brw_vs_emit(struct brw_vs_compile *c )
1255 {
1256 #define MAX_IF_DEPTH 32
1257 #define MAX_LOOP_DEPTH 32
1258    struct brw_compile *p = &c->func;
1259    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1260    GLuint insn, if_depth = 0, loop_depth = 0;
1261    GLuint end_offset = 0;
1262    struct brw_instruction *end_inst, *last_inst;
1263    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1264    const struct brw_indirect stack_index = brw_indirect(0, 0);
1265    GLuint index;
1266    GLuint file;
1267
1268    if (INTEL_DEBUG & DEBUG_VS) {
1269       _mesa_printf("vs-emit:\n");
1270       _mesa_print_program(&c->vp->program.Base);
1271       _mesa_printf("\n");
1272    }
1273
1274    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1275    brw_set_access_mode(p, BRW_ALIGN_16);
1276
1277    /* Message registers can't be read, so copy the output into GRF register
1278       if they are used in source registers */
1279    for (insn = 0; insn < nr_insns; insn++) {
1280        GLuint i;
1281        struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1282        for (i = 0; i < 3; i++) {
1283            struct prog_src_register *src = &inst->SrcReg[i];
1284            GLuint index = src->Index;
1285            GLuint file = src->File;
1286            if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1287                c->output_regs[index].used_in_src = GL_TRUE;
1288        }
1289    }
1290
1291    /* Static register allocation
1292     */
1293    brw_vs_alloc_regs(c);
1294    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1295
1296    for (insn = 0; insn < nr_insns; insn++) {
1297
1298       const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1299       struct brw_reg args[3], dst;
1300       GLuint i;
1301
1302 #if 0
1303       printf("%d: ", insn);
1304       _mesa_print_instruction(inst);
1305 #endif
1306
1307       /* Get argument regs.  SWZ is special and does this itself.
1308        */
1309       if (inst->Opcode != OPCODE_SWZ)
1310           for (i = 0; i < 3; i++) {
1311               const struct prog_src_register *src = &inst->SrcReg[i];
1312               index = src->Index;
1313               file = src->File;
1314               if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1315                   args[i] = c->output_regs[index].reg;
1316               else
1317                   args[i] = get_arg(c, inst, i);
1318           }
1319
1320       /* Get dest regs.  Note that it is possible for a reg to be both
1321        * dst and arg, given the static allocation of registers.  So
1322        * care needs to be taken emitting multi-operation instructions.
1323        */
1324       index = inst->DstReg.Index;
1325       file = inst->DstReg.File;
1326       if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1327           dst = c->output_regs[index].reg;
1328       else
1329           dst = get_dst(c, inst->DstReg);
1330
1331       if (inst->SaturateMode != SATURATE_OFF) {
1332          _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1333                        inst->SaturateMode);
1334       }
1335
1336       switch (inst->Opcode) {
1337       case OPCODE_ABS:
1338          brw_MOV(p, dst, brw_abs(args[0]));
1339          break;
1340       case OPCODE_ADD:
1341          brw_ADD(p, dst, args[0], args[1]);
1342          break;
1343       case OPCODE_COS:
1344          emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1345          break;
1346       case OPCODE_DP3:
1347          brw_DP3(p, dst, args[0], args[1]);
1348          break;
1349       case OPCODE_DP4:
1350          brw_DP4(p, dst, args[0], args[1]);
1351          break;
1352       case OPCODE_DPH:
1353          brw_DPH(p, dst, args[0], args[1]);
1354          break;
1355       case OPCODE_NRM3:
1356          emit_nrm(c, dst, args[0], 3);
1357          break;
1358       case OPCODE_NRM4:
1359          emit_nrm(c, dst, args[0], 4);
1360          break;
1361       case OPCODE_DST:
1362          unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1363          break;
1364       case OPCODE_EXP:
1365          unalias1(c, dst, args[0], emit_exp_noalias);
1366          break;
1367       case OPCODE_EX2:
1368          emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1369          break;
1370       case OPCODE_ARL:
1371          emit_arl(c, dst, args[0]);
1372          break;
1373       case OPCODE_FLR:
1374          brw_RNDD(p, dst, args[0]);
1375          break;
1376       case OPCODE_FRC:
1377          brw_FRC(p, dst, args[0]);
1378          break;
1379       case OPCODE_LOG:
1380          unalias1(c, dst, args[0], emit_log_noalias);
1381          break;
1382       case OPCODE_LG2:
1383          emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1384          break;
1385       case OPCODE_LIT:
1386          unalias1(c, dst, args[0], emit_lit_noalias);
1387          break;
1388       case OPCODE_LRP:
1389          unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1390          break;
1391       case OPCODE_MAD:
1392          brw_MOV(p, brw_acc_reg(), args[2]);
1393          brw_MAC(p, dst, args[0], args[1]);
1394          break;
1395       case OPCODE_MAX:
1396          emit_max(p, dst, args[0], args[1]);
1397          break;
1398       case OPCODE_MIN:
1399          emit_min(p, dst, args[0], args[1]);
1400          break;
1401       case OPCODE_MOV:
1402          brw_MOV(p, dst, args[0]);
1403          break;
1404       case OPCODE_MUL:
1405          brw_MUL(p, dst, args[0], args[1]);
1406          break;
1407       case OPCODE_POW:
1408          emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1409          break;
1410       case OPCODE_RCP:
1411          emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1412          break;
1413       case OPCODE_RSQ:
1414          emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1415          break;
1416
1417       case OPCODE_SEQ:
1418          emit_seq(p, dst, args[0], args[1]);
1419          break;
1420       case OPCODE_SIN:
1421          emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1422          break;
1423       case OPCODE_SNE:
1424          emit_sne(p, dst, args[0], args[1]);
1425          break;
1426       case OPCODE_SGE:
1427          emit_sge(p, dst, args[0], args[1]);
1428          break;
1429       case OPCODE_SGT:
1430          emit_sgt(p, dst, args[0], args[1]);
1431          break;
1432       case OPCODE_SLT:
1433          emit_slt(p, dst, args[0], args[1]);
1434          break;
1435       case OPCODE_SLE:
1436          emit_sle(p, dst, args[0], args[1]);
1437          break;
1438       case OPCODE_SUB:
1439          brw_ADD(p, dst, args[0], negate(args[1]));
1440          break;
1441       case OPCODE_SWZ:
1442          /* The args[0] value can't be used here as it won't have
1443           * correctly encoded the full swizzle:
1444           */
1445          emit_swz(c, dst, inst);
1446          break;
1447       case OPCODE_TRUNC:
1448          /* round toward zero */
1449          brw_RNDZ(p, dst, args[0]);
1450          break;
1451       case OPCODE_XPD:
1452          emit_xpd(p, dst, args[0], args[1]);
1453          break;
1454       case OPCODE_IF:
1455          assert(if_depth < MAX_IF_DEPTH);
1456          if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1457          break;
1458       case OPCODE_ELSE:
1459          if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1460          break;
1461       case OPCODE_ENDIF:
1462          assert(if_depth > 0);
1463          brw_ENDIF(p, if_inst[--if_depth]);
1464          break;
1465 #if 0
1466       case OPCODE_BGNLOOP:
1467          loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1468          break;
1469       case OPCODE_BRK:
1470          brw_BREAK(p);
1471          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1472          break;
1473       case OPCODE_CONT:
1474          brw_CONT(p);
1475          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1476          break;
1477       case OPCODE_ENDLOOP:
1478          {
1479             struct brw_instruction *inst0, *inst1;
1480             loop_depth--;
1481             inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1482             /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1483             while (inst0 > loop_inst[loop_depth]) {
1484                inst0--;
1485                if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1486                   inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
1487                   inst0->bits3.if_else.pop_count = 0;
1488                }
1489                else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1490                   inst0->bits3.if_else.jump_count = inst1 - inst0;
1491                   inst0->bits3.if_else.pop_count = 0;
1492                }
1493             }
1494          }
1495          break;
1496 #else
1497          (void) loop_inst;
1498          (void) loop_depth;
1499 #endif
1500       case OPCODE_BRA:
1501          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1502          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1503          brw_set_predicate_control_flag_value(p, 0xff);
1504          break;
1505       case OPCODE_CAL:
1506          brw_set_access_mode(p, BRW_ALIGN_1);
1507          brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1508          brw_set_access_mode(p, BRW_ALIGN_16);
1509          brw_ADD(p, get_addr_reg(stack_index),
1510                          get_addr_reg(stack_index), brw_imm_d(4));
1511          brw_save_call(p, inst->Comment, p->nr_insn);
1512          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1513          break;
1514       case OPCODE_RET:
1515          brw_ADD(p, get_addr_reg(stack_index),
1516                          get_addr_reg(stack_index), brw_imm_d(-4));
1517          brw_set_access_mode(p, BRW_ALIGN_1);
1518          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1519          brw_set_access_mode(p, BRW_ALIGN_16);
1520          break;
1521       case OPCODE_END:
1522          end_offset = p->nr_insn;
1523          /* this instruction will get patched later to jump past subroutine
1524           * code, etc.
1525           */
1526          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1527          break;
1528       case OPCODE_PRINT:
1529          /* no-op */
1530          break;
1531       case OPCODE_BGNSUB:
1532          brw_save_label(p, inst->Comment, p->nr_insn);
1533          break;
1534       case OPCODE_ENDSUB:
1535          /* no-op */
1536          break;
1537       default:
1538          _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1539                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
1540                                     _mesa_opcode_string(inst->Opcode) :
1541                                     "unknown");
1542       }
1543
1544       if ((inst->DstReg.File == PROGRAM_OUTPUT)
1545           && (inst->DstReg.Index != VERT_RESULT_HPOS)
1546           && c->output_regs[inst->DstReg.Index].used_in_src) {
1547          brw_MOV(p, get_dst(c, inst->DstReg), dst);
1548       }
1549
1550       /* Result color clamping.
1551        *
1552        * When destination register is an output register and
1553        * it's primary/secondary front/back color, we have to clamp
1554        * the result to [0,1]. This is done by enabling the
1555        * saturation bit for the last instruction.
1556        *
1557        * We don't use brw_set_saturate() as it modifies
1558        * p->current->header.saturate, which affects all the subsequent
1559        * instructions. Instead, we directly modify the header
1560        * of the last (already stored) instruction.
1561        */
1562       if (inst->DstReg.File == PROGRAM_OUTPUT) {
1563          if ((inst->DstReg.Index == VERT_RESULT_COL0)
1564              || (inst->DstReg.Index == VERT_RESULT_COL1)
1565              || (inst->DstReg.Index == VERT_RESULT_BFC0)
1566              || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1567             p->store[p->nr_insn-1].header.saturate = 1;
1568          }
1569       }
1570
1571       release_tmps(c);
1572    }
1573
1574    end_inst = &p->store[end_offset];
1575    last_inst = &p->store[p->nr_insn];
1576
1577    /* The END instruction will be patched to jump to this code */
1578    emit_vertex_write(c);
1579
1580    post_vs_emit(c, end_inst, last_inst);
1581 }