88327d9927d6596f0f8c0489502da46866db98a2
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41 static struct brw_reg get_tmp( struct brw_vs_compile *c )
42 {
43 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
44
45 if (++c->last_tmp > c->prog_data.total_grf)
46 c->prog_data.total_grf = c->last_tmp;
47
48 return tmp;
49 }
50
51 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
52 {
53 if (tmp.nr == c->last_tmp-1)
54 c->last_tmp--;
55 }
56
57 static void release_tmps( struct brw_vs_compile *c )
58 {
59 c->last_tmp = c->first_tmp;
60 }
61
62
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout (in allocation order): r0 header, curbe (clip planes +
 * pushed constants), vertex inputs, HPOS/PSIZ/overflow outputs,
 * program temporaries, address regs, pull-constant slots, output
 * shadow copies, subroutine stack, then scratch temps.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      for (i = 0; i < c->key.nr_userclip; i++) {
         /* two vec4 planes are packed per GRF */
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* NOTE(review): the "20" reserve here must stay in sync with the
       * heuristic above -- confirm if either changes.
       */
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
           i < c->vp->program.Base.NumInstructions && constant < max_constant;
           i++) {
         struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
         int arg;

         for (arg = 0; arg < 3 && constant < max_constant; arg++) {
            /* skip non-constant files and relative-addressed operands;
             * those stay in the pull buffer */
            if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
                 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
                 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
                 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
                 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
                inst->SrcReg[arg].RelAddr)
               continue;

            if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
               c->constant_map[inst->SrcReg[arg].Index] = constant++;
            }
         }
      }

      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
                                                              (i%2) * 4),
                                                 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* first MRF available for outputs differs per generation */
   if (intel->is_ironlake)
      mrf = 8;
   else
      mrf = 4;

   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            /* position needs post-processing (NDC), so keep it in a GRF */
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
            mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
         }
         else {
            if (mrf < 16) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               /* too many vertex results to fit in MRF, use GRF for overflow */
               if (!c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                            reg,
                                            0,
                                            BRW_REGISTER_TYPE_D,
                                            BRW_VERTICAL_STRIDE_8,
                                            BRW_WIDTH_8,
                                            BRW_HORIZONTAL_STRIDE_1,
                                            BRW_SWIZZLE_XXXX,
                                            WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* one landing slot per possible source operand for pull-constant reads */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* shadow copies for outputs that are also read as sources */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   if (intel->is_ironlake)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
295
296
297 /**
298 * If an instruction uses a temp reg both as a src and the dest, we
299 * sometimes need to allocate an intermediate temporary.
300 */
301 static void unalias1( struct brw_vs_compile *c,
302 struct brw_reg dst,
303 struct brw_reg arg0,
304 void (*func)( struct brw_vs_compile *,
305 struct brw_reg,
306 struct brw_reg ))
307 {
308 if (dst.file == arg0.file && dst.nr == arg0.nr) {
309 struct brw_compile *p = &c->func;
310 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
311 func(c, tmp, arg0);
312 brw_MOV(p, dst, tmp);
313 release_tmp(c, tmp);
314 }
315 else {
316 func(c, dst, arg0);
317 }
318 }
319
320 /**
321 * \sa unalias2
322 * Checkes if 2-operand instruction needs an intermediate temporary.
323 */
324 static void unalias2( struct brw_vs_compile *c,
325 struct brw_reg dst,
326 struct brw_reg arg0,
327 struct brw_reg arg1,
328 void (*func)( struct brw_vs_compile *,
329 struct brw_reg,
330 struct brw_reg,
331 struct brw_reg ))
332 {
333 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
334 (dst.file == arg1.file && dst.nr == arg1.nr)) {
335 struct brw_compile *p = &c->func;
336 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
337 func(c, tmp, arg0, arg1);
338 brw_MOV(p, dst, tmp);
339 release_tmp(c, tmp);
340 }
341 else {
342 func(c, dst, arg0, arg1);
343 }
344 }
345
346 /**
347 * \sa unalias2
348 * Checkes if 3-operand instruction needs an intermediate temporary.
349 */
350 static void unalias3( struct brw_vs_compile *c,
351 struct brw_reg dst,
352 struct brw_reg arg0,
353 struct brw_reg arg1,
354 struct brw_reg arg2,
355 void (*func)( struct brw_vs_compile *,
356 struct brw_reg,
357 struct brw_reg,
358 struct brw_reg,
359 struct brw_reg ))
360 {
361 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
362 (dst.file == arg1.file && dst.nr == arg1.nr) ||
363 (dst.file == arg2.file && dst.nr == arg2.nr)) {
364 struct brw_compile *p = &c->func;
365 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
366 func(c, tmp, arg0, arg1, arg2);
367 brw_MOV(p, dst, tmp);
368 release_tmp(c, tmp);
369 }
370 else {
371 func(c, dst, arg0, arg1, arg2);
372 }
373 }
374
/**
 * Emit a "set on condition" sequence (shared by SEQ/SNE/SLT/...):
 * dst = (arg0 cond arg1) ? 1.0 : 0.0 per channel.
 *
 * NOTE(review): the 1.0 MOV relies on brw_CMP with a null destination
 * leaving predication armed on the following instruction -- confirm
 * against brw_eu_emit.c.  The final call restores the flag register so
 * later instructions run unpredicated.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
388
/** SEQ: dst.chan = (arg0.chan == arg1.chan) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
396
/** SNE: dst.chan = (arg0.chan != arg1.chan) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/** SLT: dst.chan = (arg0.chan < arg1.chan) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
411
/** SLE: dst.chan = (arg0.chan <= arg1.chan) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
419
/** SGT: dst.chan = (arg0.chan > arg1.chan) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
427
/** SGE: dst.chan = (arg0.chan >= arg1.chan) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
435
/**
 * CMP: dst.chan = (arg0.chan < 0) ? arg1.chan : arg2.chan.
 * The CMP arms the flag register; the SEL then picks per channel, and
 * predication is explicitly cleared afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
446
/**
 * MAX: dst.chan = max(arg0.chan, arg1.chan), via a predicated SEL
 * keyed off arg0 < arg1 (so arg1 is chosen when it is larger).
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
456
/**
 * MIN: dst.chan = min(arg0.chan, arg1.chan); same CMP/SEL pattern as
 * emit_max() with the select operands swapped.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
466
467
/**
 * Emit a single-operand math-box operation (EXP, LOG, RSQ, ...).
 *
 * \param function   BRW_MATH_FUNCTION_* opcode
 * \param precision  BRW_MATH_PRECISION_FULL or _PARTIAL
 */
static void emit_math1( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* math results can't be writemasked or target a non-GRF directly;
    * stage through a full temporary in those cases */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,		/* base message reg -- matches emit_math2()'s payload at m2/m3 */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
503
504
/**
 * Emit a two-operand math-box operation (POW, ...).  The second
 * operand is placed in message register m3 by hand; the math message
 * itself starts at m2.
 */
static void emit_math2( struct brw_vs_compile *c,
			GLuint function,
			struct brw_reg dst,
			struct brw_reg arg0,
			struct brw_reg arg1,
			GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* same writemask/non-GRF staging kludge as emit_math1() */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* second operand goes in the message payload */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,		/* base message reg */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
536
537
/**
 * Emit OPCODE_EXP: dst = (2^floor(x), x-floor(x), 2^x, 1.0).
 * Caller must guarantee dst does not alias arg0 (see unalias callers).
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
593
594
/**
 * Emit OPCODE_LOG: dst = (floor(log2(|x|)), mantissa, log2(|x|), 1.0),
 * built by picking the IEEE-754 exponent and mantissa fields apart with
 * integer ops.  Caller must guarantee dst does not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* stage through a temp when dst is writemasked or not a GRF
    * (results are built up across several channels) */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* mask off the sign bit, then shift the biased exponent down */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* remove the exponent bias */
      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* keep the mantissa bits and re-attach an exponent of 0 (bias 127),
       * yielding a float in [1, 2) */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
684
685
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 *
 * OPCODE_DST: dst = (1, arg0.y*arg1.y, arg0.z, arg1.w), emitted one
 * enabled channel at a time.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
706
707
/**
 * Cross product: dst = t x u.  The MUL to the null register primes the
 * accumulator with t.yzx*u.zxy; the MAC then subtracts t.zxy*u.yzx
 * (via the negated source) while adding the accumulator.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
716
717
718 static void emit_lit_noalias( struct brw_vs_compile *c,
719 struct brw_reg dst,
720 struct brw_reg arg0 )
721 {
722 struct brw_compile *p = &c->func;
723 struct brw_instruction *if_insn;
724 struct brw_reg tmp = dst;
725 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
726
727 if (need_tmp)
728 tmp = get_tmp(c);
729
730 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
731 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
732
733 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
734 * to get all channels active inside the IF. In the clipping code
735 * we run with NoMask, so it's not an option and we can use
736 * BRW_EXECUTE_1 for all comparisions.
737 */
738 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
739 if_insn = brw_IF(p, BRW_EXECUTE_8);
740 {
741 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
742
743 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
744 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
745 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
746
747 emit_math2(c,
748 BRW_MATH_FUNCTION_POW,
749 brw_writemask(dst, WRITEMASK_Z),
750 brw_swizzle1(tmp, 2),
751 brw_swizzle1(arg0, 3),
752 BRW_MATH_PRECISION_PARTIAL);
753 }
754
755 brw_ENDIF(p, if_insn);
756
757 release_tmp(c, tmp);
758 }
759
/**
 * Emit OPCODE_LRP: dst = arg0*arg1 + (1-arg0)*arg2.
 * The MUL to the null register leaves (1-arg0)*arg2 in the accumulator,
 * which the MAC then adds to arg0*arg1.  Caller guarantees no aliasing.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
772
/** 3 or 4-component vector normalization: dst = arg0 / |arg0| */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
796
797
/**
 * Fetch a (non-reladdr) constant for source operand \p argIndex from
 * the pull constant buffer, caching the last index loaded per slot so
 * repeated reads of the same constant skip the data-port round trip.
 * Returns the slot register swizzled so XYZW appears in both halves.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                      /* writeback dest */
                       0,                              /* oword */
                       0,                              /* relative indexing? */
                       addrReg,                        /* address register */
                       16 * src->Index,                /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER    /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
836
/**
 * Fetch a relative-addressed constant (c[a0.x + Index]) from the pull
 * buffer.  Two oword reads are issued -- one per half of the address
 * register -- and merged so each vertex of the pair gets its own value.
 * Never cached, since the effective index varies at run time.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg const2_reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* fetch the first vec4 */
   brw_dp_READ_4_vs(p,
		    const_reg,                     /* writeback dest */
		    0,                             /* oword */
		    1,                             /* relative indexing? */
		    addrReg,                       /* address register */
		    16 * src->Index,               /* byte offset */
		    SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
		    );
   /* second vec4 */
   const2_reg = get_tmp(c);

   /* use upper half of address reg for second read */
   addrReg = stride(addrReg, 0, 4, 0);
   addrReg.subnr = 16;

   brw_dp_READ_4_vs(p,
		    const2_reg,              /* writeback dest */
		    1,                       /* oword */
		    1,                       /* relative indexing? */
		    addrReg,                 /* address register */
		    16 * src->Index,         /* byte offset */
		    SURF_INDEX_VERT_CONST_BUFFER
		    );

   /* merge the two Owords into the constant register */
   /* const_reg[7..4] = const2_reg[7..4] */
   brw_MOV(p,
	   suboffset(stride(const_reg, 0, 4, 1), 4),
	   suboffset(stride(const2_reg, 0, 4, 1), 4));
   release_tmp(c, const2_reg);

   return const_reg;
}
892
893
894
895 /* TODO: relative addressing!
896 */
897 static struct brw_reg get_reg( struct brw_vs_compile *c,
898 gl_register_file file,
899 GLuint index )
900 {
901 switch (file) {
902 case PROGRAM_TEMPORARY:
903 case PROGRAM_INPUT:
904 case PROGRAM_OUTPUT:
905 assert(c->regs[file][index].nr != 0);
906 return c->regs[file][index];
907 case PROGRAM_STATE_VAR:
908 case PROGRAM_CONSTANT:
909 case PROGRAM_UNIFORM:
910 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
911 return c->regs[PROGRAM_STATE_VAR][index];
912 case PROGRAM_ADDRESS:
913 assert(index == 0);
914 return c->regs[file][index];
915
916 case PROGRAM_UNDEFINED: /* undef values */
917 return brw_null_reg();
918
919 case PROGRAM_LOCAL_PARAM:
920 case PROGRAM_ENV_PARAM:
921 case PROGRAM_WRITE_ONLY:
922 default:
923 assert(0);
924 return brw_null_reg();
925 }
926 }
927
928
/**
 * Indirect addressing:  get reg[[arg] + offset].
 *
 * Loads both vertices' 4-dword values through the hardware indirect
 * register, one half at a time.  The returned temp is deliberately not
 * released -- the caller consumes it as a source operand.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   /* base byte address of reg[offset]; the address reg contents (scaled
    * by 16 in emit_arl) are added at run time */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return vec8(tmp);
}
963
964
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * Routes constant-like files through the push block, the pull buffer or
 * an indirect deref depending on use_const_buffer / constant_map /
 * RelAddr; everything else maps straight to its preallocated register.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         /* reg[a0 + index], fetched through the indirect path */
         return deref(c, c->regs[file][0], index);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
         if (!relAddr && c->constant_map[index] != -1) {
            /* constant was promoted into the push (curbe) block */
            assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
            return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
         } else if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
      }
      else {
         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1025
1026
/**
 * Emit OPCODE_ARL: round arg0 to an integer and pre-scale it by 16
 * (one vec4 is 16 bytes), so deref() can add it straight to a byte
 * offset.
 */
static void emit_arl( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */

   if (need_tmp)
      release_tmp(c, tmp);
}
1044
1045
/**
 * Return the brw reg for the given instruction's src argument.
 * Will return mangled results for SWZ op.  The emit_swz() function
 * ignores this result and recalculates taking extended swizzles into
 * account.
 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
                               const struct prog_instruction *inst,
                               GLuint argIndex )
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_reg reg;

   if (src->File == PROGRAM_UNDEFINED)
      return brw_null_reg();

   reg = get_src_reg(c, inst, argIndex);

   /* Convert 3-bit swizzle to 2-bit.
    * (Mesa swizzles can also encode ZERO/ONE, hence "mangled" for SWZ.)
    */
   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
				       GET_SWZ(src->Swizzle, 1),
				       GET_SWZ(src->Swizzle, 2),
				       GET_SWZ(src->Swizzle, 3));

   /* Note this is ok for non-swizzle instructions:
    */
   reg.negate = src->Negate ? 1 : 0;

   return reg;
}
1077
1078
1079 /**
1080 * Get brw register for the given program dest register.
1081 */
1082 static struct brw_reg get_dst( struct brw_vs_compile *c,
1083 struct prog_dst_register dst )
1084 {
1085 struct brw_reg reg;
1086
1087 switch (dst.File) {
1088 case PROGRAM_TEMPORARY:
1089 case PROGRAM_OUTPUT:
1090 assert(c->regs[dst.File][dst.Index].nr != 0);
1091 reg = c->regs[dst.File][dst.Index];
1092 break;
1093 case PROGRAM_ADDRESS:
1094 assert(dst.Index == 0);
1095 reg = c->regs[dst.File][dst.Index];
1096 break;
1097 case PROGRAM_UNDEFINED:
1098 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1099 reg = brw_null_reg();
1100 break;
1101 default:
1102 assert(0);
1103 reg = brw_null_reg();
1104 }
1105
1106 reg.dw1.bits.writemask = dst.WriteMask;
1107
1108 return reg;
1109 }
1110
1111
/**
 * Emit OPCODE_SWZ, which supports extended swizzles with ZERO and ONE
 * components.  The four channels are classified into real-source,
 * zero and one groups, each written with its own MOV; negation is
 * applied last.
 *
 * NOTE(review): the final MOV uses src.Negate directly as a writemask,
 * which assumes Mesa's per-channel NEGATE_* bits line up with the
 * WRITEMASK_* bits -- confirm against prog_instruction.h.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* classify each written channel: real source component, 0.0 or 1.0 */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1180
1181
1182 /**
1183 * Post-vertex-program processing. Send the results to the URB.
1184 */
1185 static void emit_vertex_write( struct brw_vs_compile *c)
1186 {
1187 struct brw_compile *p = &c->func;
1188 struct brw_context *brw = p->brw;
1189 struct intel_context *intel = &brw->intel;
1190 struct brw_reg m0 = brw_message_reg(0);
1191 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1192 struct brw_reg ndc;
1193 int eot;
1194 GLuint len_vertext_header = 2;
1195
1196 if (c->key.copy_edgeflag) {
1197 brw_MOV(p,
1198 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1199 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1200 }
1201
1202 /* Build ndc coords */
1203 ndc = get_tmp(c);
1204 /* ndc = 1.0 / pos.w */
1205 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1206 /* ndc.xyz = pos * ndc */
1207 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1208
1209 /* Update the header for point size, user clipping flags, and -ve rhw
1210 * workaround.
1211 */
1212 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1213 c->key.nr_userclip || brw->has_negative_rhw_bug)
1214 {
1215 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1216 GLuint i;
1217
1218 brw_MOV(p, header1, brw_imm_ud(0));
1219
1220 brw_set_access_mode(p, BRW_ALIGN_16);
1221
1222 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1223 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1224 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1225 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1226 }
1227
1228 for (i = 0; i < c->key.nr_userclip; i++) {
1229 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1230 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1231 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1232 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1233 }
1234
1235 /* i965 clipping workaround:
1236 * 1) Test for -ve rhw
1237 * 2) If set,
1238 * set ndc = (0,0,0,0)
1239 * set ucp[6] = 1
1240 *
1241 * Later, clipping will detect ucp[6] and ensure the primitive is
1242 * clipped against all fixed planes.
1243 */
1244 if (brw->has_negative_rhw_bug) {
1245 brw_CMP(p,
1246 vec8(brw_null_reg()),
1247 BRW_CONDITIONAL_L,
1248 brw_swizzle1(ndc, 3),
1249 brw_imm_f(0));
1250
1251 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1252 brw_MOV(p, ndc, brw_imm_f(0));
1253 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1254 }
1255
1256 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1257 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1258 brw_set_access_mode(p, BRW_ALIGN_16);
1259
1260 release_tmp(c, header1);
1261 }
1262 else {
1263 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1264 }
1265
1266 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1267 * of zeros followed by two sets of NDC coordinates:
1268 */
1269 brw_set_access_mode(p, BRW_ALIGN_1);
1270 brw_MOV(p, offset(m0, 2), ndc);
1271
1272 if (intel->is_ironlake) {
1273 /* There are 20 DWs (D0-D19) in VUE vertex header on Ironlake */
1274 brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1275 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1276 * Seems it is useless for us.
1277 * m6 is used for aligning, so that the remainder of vertex element is
1278 * reg-aligned.
1279 */
1280 brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1281 len_vertext_header = 6;
1282 } else {
1283 brw_MOV(p, offset(m0, 3), pos);
1284 len_vertext_header = 2;
1285 }
1286
1287 eot = (c->first_overflow_output == 0);
1288
1289 brw_urb_WRITE(p,
1290 brw_null_reg(), /* dest */
1291 0, /* starting mrf reg nr */
1292 c->r0, /* src */
1293 0, /* allocate */
1294 1, /* used */
1295 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1296 0, /* response len */
1297 eot, /* eot */
1298 eot, /* writes complete */
1299 0, /* urb destination offset */
1300 BRW_URB_SWIZZLE_INTERLEAVE);
1301
1302 if (c->first_overflow_output > 0) {
1303 /* Not all of the vertex outputs/results fit into the MRF.
1304 * Move the overflowed attributes from the GRF to the MRF and
1305 * issue another brw_urb_WRITE().
1306 */
1307 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1308 * at mrf[4] atm...
1309 */
1310 GLuint i, mrf = 0;
1311 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1312 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1313 /* move from GRF to MRF */
1314 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
1315 mrf++;
1316 }
1317 }
1318
1319 brw_urb_WRITE(p,
1320 brw_null_reg(), /* dest */
1321 4, /* starting mrf reg nr */
1322 c->r0, /* src */
1323 0, /* allocate */
1324 1, /* used */
1325 mrf+1, /* msg len */
1326 0, /* response len */
1327 1, /* eot */
1328 1, /* writes complete */
1329 BRW_MAX_MRF-1, /* urb destination offset */
1330 BRW_URB_SWIZZLE_INTERLEAVE);
1331 }
1332 }
1333
1334
1335 /**
1336 * Called after code generation to resolve subroutine calls and the
1337 * END instruction.
1338 * \param end_inst points to brw code for END instruction
1339 * \param last_inst points to last instruction emitted before vertex write
1340 */
1341 static void
1342 post_vs_emit( struct brw_vs_compile *c,
1343 struct brw_instruction *end_inst,
1344 struct brw_instruction *last_inst )
1345 {
1346 GLint offset;
1347
1348 brw_resolve_cals(&c->func);
1349
1350 /* patch up the END code to jump past subroutines, etc */
1351 offset = last_inst - end_inst;
1352 if (offset > 1) {
1353 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1354 } else {
1355 end_inst->header.opcode = BRW_OPCODE_NOP;
1356 }
1357 }
1358
1359 static GLboolean
1360 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1361 {
1362 struct brw_compile *p = &c->func;
1363 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1364
1365 if (p->nr_insn == 0)
1366 return GL_FALSE;
1367
1368 if (val.address_mode != BRW_ADDRESS_DIRECT)
1369 return GL_FALSE;
1370
1371 switch (prev_insn->header.opcode) {
1372 case BRW_OPCODE_MOV:
1373 case BRW_OPCODE_MAC:
1374 case BRW_OPCODE_MUL:
1375 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1376 prev_insn->header.execution_size == val.width &&
1377 prev_insn->bits1.da1.dest_reg_file == val.file &&
1378 prev_insn->bits1.da1.dest_reg_type == val.type &&
1379 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1380 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1381 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1382 prev_insn->bits1.da16.dest_writemask == 0xf)
1383 return GL_TRUE;
1384 else
1385 return GL_FALSE;
1386 default:
1387 return GL_FALSE;
1388 }
1389 }
1390
1391 static uint32_t
1392 get_predicate(const struct prog_instruction *inst)
1393 {
1394 if (inst->DstReg.CondMask == COND_TR)
1395 return BRW_PREDICATE_NONE;
1396
1397 /* All of GLSL only produces predicates for COND_NE and one channel per
1398 * vector. Fail badly if someone starts doing something else, as it might
1399 * mean infinite looping or something.
1400 *
1401 * We'd like to support all the condition codes, but our hardware doesn't
1402 * quite match the Mesa IR, which is modeled after the NV extensions. For
1403 * those, the instruction may update the condition codes or not, then any
1404 * later instruction may use one of those condition codes. For gen4, the
1405 * instruction may update the flags register based on one of the condition
1406 * codes output by the instruction, and then further instructions may
1407 * predicate on that. We can probably support this, but it won't
1408 * necessarily be easy.
1409 */
1410 assert(inst->DstReg.CondMask == COND_NE);
1411
1412 switch (inst->DstReg.CondSwizzle) {
1413 case SWIZZLE_XXXX:
1414 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1415 case SWIZZLE_YYYY:
1416 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1417 case SWIZZLE_ZZZZ:
1418 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1419 case SWIZZLE_WWWW:
1420 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1421 default:
1422 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1423 inst->DstReg.CondMask);
1424 return BRW_PREDICATE_NORMAL;
1425 }
1426 }
1427
1428 /* Emit the vertex program instructions here.
1429 */
1430 void brw_vs_emit(struct brw_vs_compile *c )
1431 {
1432 #define MAX_IF_DEPTH 32
1433 #define MAX_LOOP_DEPTH 32
1434 struct brw_compile *p = &c->func;
1435 struct brw_context *brw = p->brw;
1436 struct intel_context *intel = &brw->intel;
1437 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1438 GLuint insn, if_depth = 0, loop_depth = 0;
1439 GLuint end_offset = 0;
1440 struct brw_instruction *end_inst, *last_inst;
1441 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1442 const struct brw_indirect stack_index = brw_indirect(0, 0);
1443 GLuint index;
1444 GLuint file;
1445
1446 if (INTEL_DEBUG & DEBUG_VS) {
1447 printf("vs-mesa:\n");
1448 _mesa_print_program(&c->vp->program.Base);
1449 printf("\n");
1450 }
1451
1452 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1453 brw_set_access_mode(p, BRW_ALIGN_16);
1454
1455 for (insn = 0; insn < nr_insns; insn++) {
1456 GLuint i;
1457 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1458
1459 /* Message registers can't be read, so copy the output into GRF
1460 * register if they are used in source registers
1461 */
1462 for (i = 0; i < 3; i++) {
1463 struct prog_src_register *src = &inst->SrcReg[i];
1464 GLuint index = src->Index;
1465 GLuint file = src->File;
1466 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1467 c->output_regs[index].used_in_src = GL_TRUE;
1468 }
1469
1470 switch (inst->Opcode) {
1471 case OPCODE_CAL:
1472 case OPCODE_RET:
1473 c->needs_stack = GL_TRUE;
1474 break;
1475 default:
1476 break;
1477 }
1478 }
1479
1480 /* Static register allocation
1481 */
1482 brw_vs_alloc_regs(c);
1483
1484 if (c->needs_stack)
1485 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1486
1487 for (insn = 0; insn < nr_insns; insn++) {
1488
1489 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1490 struct brw_reg args[3], dst;
1491 GLuint i;
1492
1493 #if 0
1494 printf("%d: ", insn);
1495 _mesa_print_instruction(inst);
1496 #endif
1497
1498 /* Get argument regs. SWZ is special and does this itself.
1499 */
1500 if (inst->Opcode != OPCODE_SWZ)
1501 for (i = 0; i < 3; i++) {
1502 const struct prog_src_register *src = &inst->SrcReg[i];
1503 index = src->Index;
1504 file = src->File;
1505 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1506 args[i] = c->output_regs[index].reg;
1507 else
1508 args[i] = get_arg(c, inst, i);
1509 }
1510
1511 /* Get dest regs. Note that it is possible for a reg to be both
1512 * dst and arg, given the static allocation of registers. So
1513 * care needs to be taken emitting multi-operation instructions.
1514 */
1515 index = inst->DstReg.Index;
1516 file = inst->DstReg.File;
1517 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1518 dst = c->output_regs[index].reg;
1519 else
1520 dst = get_dst(c, inst->DstReg);
1521
1522 if (inst->SaturateMode != SATURATE_OFF) {
1523 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1524 inst->SaturateMode);
1525 }
1526
1527 switch (inst->Opcode) {
1528 case OPCODE_ABS:
1529 brw_MOV(p, dst, brw_abs(args[0]));
1530 break;
1531 case OPCODE_ADD:
1532 brw_ADD(p, dst, args[0], args[1]);
1533 break;
1534 case OPCODE_COS:
1535 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1536 break;
1537 case OPCODE_DP3:
1538 brw_DP3(p, dst, args[0], args[1]);
1539 break;
1540 case OPCODE_DP4:
1541 brw_DP4(p, dst, args[0], args[1]);
1542 break;
1543 case OPCODE_DPH:
1544 brw_DPH(p, dst, args[0], args[1]);
1545 break;
1546 case OPCODE_NRM3:
1547 emit_nrm(c, dst, args[0], 3);
1548 break;
1549 case OPCODE_NRM4:
1550 emit_nrm(c, dst, args[0], 4);
1551 break;
1552 case OPCODE_DST:
1553 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1554 break;
1555 case OPCODE_EXP:
1556 unalias1(c, dst, args[0], emit_exp_noalias);
1557 break;
1558 case OPCODE_EX2:
1559 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1560 break;
1561 case OPCODE_ARL:
1562 emit_arl(c, dst, args[0]);
1563 break;
1564 case OPCODE_FLR:
1565 brw_RNDD(p, dst, args[0]);
1566 break;
1567 case OPCODE_FRC:
1568 brw_FRC(p, dst, args[0]);
1569 break;
1570 case OPCODE_LOG:
1571 unalias1(c, dst, args[0], emit_log_noalias);
1572 break;
1573 case OPCODE_LG2:
1574 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1575 break;
1576 case OPCODE_LIT:
1577 unalias1(c, dst, args[0], emit_lit_noalias);
1578 break;
1579 case OPCODE_LRP:
1580 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1581 break;
1582 case OPCODE_MAD:
1583 if (!accumulator_contains(c, args[2]))
1584 brw_MOV(p, brw_acc_reg(), args[2]);
1585 brw_MAC(p, dst, args[0], args[1]);
1586 break;
1587 case OPCODE_CMP:
1588 emit_cmp(p, dst, args[0], args[1], args[2]);
1589 break;
1590 case OPCODE_MAX:
1591 emit_max(p, dst, args[0], args[1]);
1592 break;
1593 case OPCODE_MIN:
1594 emit_min(p, dst, args[0], args[1]);
1595 break;
1596 case OPCODE_MOV:
1597 brw_MOV(p, dst, args[0]);
1598 break;
1599 case OPCODE_MUL:
1600 brw_MUL(p, dst, args[0], args[1]);
1601 break;
1602 case OPCODE_POW:
1603 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1604 break;
1605 case OPCODE_RCP:
1606 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1607 break;
1608 case OPCODE_RSQ:
1609 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1610 break;
1611
1612 case OPCODE_SEQ:
1613 unalias2(c, dst, args[0], args[1], emit_seq);
1614 break;
1615 case OPCODE_SIN:
1616 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1617 break;
1618 case OPCODE_SNE:
1619 unalias2(c, dst, args[0], args[1], emit_sne);
1620 break;
1621 case OPCODE_SGE:
1622 unalias2(c, dst, args[0], args[1], emit_sge);
1623 break;
1624 case OPCODE_SGT:
1625 unalias2(c, dst, args[0], args[1], emit_sgt);
1626 break;
1627 case OPCODE_SLT:
1628 unalias2(c, dst, args[0], args[1], emit_slt);
1629 break;
1630 case OPCODE_SLE:
1631 unalias2(c, dst, args[0], args[1], emit_sle);
1632 break;
1633 case OPCODE_SUB:
1634 brw_ADD(p, dst, args[0], negate(args[1]));
1635 break;
1636 case OPCODE_SWZ:
1637 /* The args[0] value can't be used here as it won't have
1638 * correctly encoded the full swizzle:
1639 */
1640 emit_swz(c, dst, inst);
1641 break;
1642 case OPCODE_TRUNC:
1643 /* round toward zero */
1644 brw_RNDZ(p, dst, args[0]);
1645 break;
1646 case OPCODE_XPD:
1647 emit_xpd(p, dst, args[0], args[1]);
1648 break;
1649 case OPCODE_IF:
1650 assert(if_depth < MAX_IF_DEPTH);
1651 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
1652 /* Note that brw_IF smashes the predicate_control field. */
1653 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
1654 if_depth++;
1655 break;
1656 case OPCODE_ELSE:
1657 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1658 break;
1659 case OPCODE_ENDIF:
1660 assert(if_depth > 0);
1661 brw_ENDIF(p, if_inst[--if_depth]);
1662 break;
1663 case OPCODE_BGNLOOP:
1664 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1665 break;
1666 case OPCODE_BRK:
1667 brw_set_predicate_control(p, get_predicate(inst));
1668 brw_BREAK(p);
1669 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1670 break;
1671 case OPCODE_CONT:
1672 brw_set_predicate_control(p, get_predicate(inst));
1673 brw_CONT(p);
1674 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1675 break;
1676 case OPCODE_ENDLOOP:
1677 {
1678 struct brw_instruction *inst0, *inst1;
1679 GLuint br = 1;
1680
1681 loop_depth--;
1682
1683 if (intel->is_ironlake)
1684 br = 2;
1685
1686 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
1687 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1688 while (inst0 > loop_inst[loop_depth]) {
1689 inst0--;
1690 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
1691 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1692 inst0->bits3.if_else.pop_count = 0;
1693 }
1694 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
1695 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1696 inst0->bits3.if_else.pop_count = 0;
1697 }
1698 }
1699 }
1700 break;
1701 case OPCODE_BRA:
1702 brw_set_predicate_control(p, get_predicate(inst));
1703 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1704 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1705 break;
1706 case OPCODE_CAL:
1707 brw_set_access_mode(p, BRW_ALIGN_1);
1708 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1709 brw_set_access_mode(p, BRW_ALIGN_16);
1710 brw_ADD(p, get_addr_reg(stack_index),
1711 get_addr_reg(stack_index), brw_imm_d(4));
1712 brw_save_call(p, inst->Comment, p->nr_insn);
1713 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1714 break;
1715 case OPCODE_RET:
1716 brw_ADD(p, get_addr_reg(stack_index),
1717 get_addr_reg(stack_index), brw_imm_d(-4));
1718 brw_set_access_mode(p, BRW_ALIGN_1);
1719 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1720 brw_set_access_mode(p, BRW_ALIGN_16);
1721 break;
1722 case OPCODE_END:
1723 end_offset = p->nr_insn;
1724 /* this instruction will get patched later to jump past subroutine
1725 * code, etc.
1726 */
1727 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1728 break;
1729 case OPCODE_PRINT:
1730 /* no-op */
1731 break;
1732 case OPCODE_BGNSUB:
1733 brw_save_label(p, inst->Comment, p->nr_insn);
1734 break;
1735 case OPCODE_ENDSUB:
1736 /* no-op */
1737 break;
1738 default:
1739 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
1740 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1741 _mesa_opcode_string(inst->Opcode) :
1742 "unknown");
1743 }
1744
1745 /* Set the predication update on the last instruction of the native
1746 * instruction sequence.
1747 *
1748 * This would be problematic if it was set on a math instruction,
1749 * but that shouldn't be the case with the current GLSL compiler.
1750 */
1751 if (inst->CondUpdate) {
1752 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1753
1754 assert(hw_insn->header.destreg__conditionalmod == 0);
1755 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1756 }
1757
1758 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1759 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1760 && c->output_regs[inst->DstReg.Index].used_in_src) {
1761 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1762 }
1763
1764 /* Result color clamping.
1765 *
1766 * When destination register is an output register and
1767 * it's primary/secondary front/back color, we have to clamp
1768 * the result to [0,1]. This is done by enabling the
1769 * saturation bit for the last instruction.
1770 *
1771 * We don't use brw_set_saturate() as it modifies
1772 * p->current->header.saturate, which affects all the subsequent
1773 * instructions. Instead, we directly modify the header
1774 * of the last (already stored) instruction.
1775 */
1776 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1777 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1778 || (inst->DstReg.Index == VERT_RESULT_COL1)
1779 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1780 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1781 p->store[p->nr_insn-1].header.saturate = 1;
1782 }
1783 }
1784
1785 release_tmps(c);
1786 }
1787
1788 end_inst = &p->store[end_offset];
1789 last_inst = &p->store[p->nr_insn];
1790
1791 /* The END instruction will be patched to jump to this code */
1792 emit_vertex_write(c);
1793
1794 post_vs_emit(c, end_inst, last_inst);
1795
1796 if (INTEL_DEBUG & DEBUG_VS) {
1797 int i;
1798
1799 printf("vs-native:\n");
1800 for (i = 0; i < p->nr_insn; i++)
1801 brw_disasm(stderr, &p->store[i]);
1802 printf("\n");
1803 }
1804 }