i965g: tgsi outputs cannot be used as source regs
[mesa.git] / src / gallium / drivers / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include "pipe/p_shader_tokens.h"
33
34 #include "util/u_memory.h"
35 #include "util/u_math.h"
36
37 #include "tgsi/tgsi_parse.h"
38 #include "tgsi/tgsi_dump.h"
39 #include "tgsi/tgsi_info.h"
40
41 #include "brw_context.h"
42 #include "brw_vs.h"
43 #include "brw_debug.h"
44
45
46
47 static struct brw_reg get_tmp( struct brw_vs_compile *c )
48 {
49 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
50
51 if (++c->last_tmp > c->prog_data.total_grf)
52 c->prog_data.total_grf = c->last_tmp;
53
54 return tmp;
55 }
56
57 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
58 {
59 if (tmp.nr == c->last_tmp-1)
60 c->last_tmp--;
61 }
62
63 static void release_tmps( struct brw_vs_compile *c )
64 {
65 c->last_tmp = c->first_tmp;
66 }
67
68
69
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout built here (in 'reg' order): r0 header, curbe (user clip
 * planes + constants), vertex inputs, immediates, overflow outputs,
 * temporaries, address regs, const-buffer staging regs, call stack,
 * then scratch temps.  Also derives urb_read_length / urb_entry_size
 * for the URB setup.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   GLuint i, reg = 0, mrf;   /* reg: next free GRF; mrf: next free message reg */
   int attributes_in_vue;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else {
      /* XXX: immediates can go elsewhere if necessary:
       */
      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
             c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 <= BRW_MAX_GRF);

      c->vp->use_const_buffer = GL_FALSE;
   }

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      /* Two clip planes per GRF, packed as vec4 halves. */
      for (i = 0; i < c->key.nr_userclip; i++) {
         c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
      }

      /* Deal with curbe alignment:
       */
      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      /* get constants from a real constant buffer */
      c->prog_data.curb_read_length = 0;
      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
      for (i = 0; i < nr_params; i++) {
         c->regs[TGSI_FILE_CONSTANT][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = c->vp->info.num_inputs;
   for (i = 0; i < c->nr_inputs; i++) {
      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate a GRF and load immediate values by hand with 4 MOVs!!!
    *
    * XXX: Try to encode float immediates as brw immediates
    * XXX: Put immediates into the CURBE.
    * XXX: Make sure ureg sets minimal immediate size and respect it
    * here.
    */
   for (i = 0; i < c->nr_immediates; i++) {
      struct brw_reg r;
      int j;

      r = brw_vec8_grf(reg, 0);

      /* One MOV per component, selected via the writemask. */
      for (j = 0; j < 4; j++) {
         brw_MOV(&c->func,
                 brw_writemask(r, (1<<j)),
                 brw_imm_f(c->immediate[i][j]));
      }

      reg++;
   }


   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = c->prog_data.nr_outputs;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* IGDNG's longer VUE header pushes the first output MRF up. */
   if (c->chipset.is_igdng)
      mrf = 8;
   else
      mrf = 4;

   /* XXX: need to access vertex output semantics here:
    */
   for (i = 0; i < c->prog_data.nr_outputs; i++) {
      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));

      /* XXX: Hardwire position to zero:
       */
      if (i == 0) {
         /* Position goes in a GRF: it is also read back when building
          * the NDC coords and the vertex header.
          */
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
      /* XXX: disable psiz:
       */
      else if (0) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
         mrf++;   /* just a placeholder?  XXX fix later stages & remove this */
      }
      else if (mrf < 16) {
         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
         mrf++;
      }
      else {
         /* too many vertex results to fit in MRF, use GRF for overflow */
         if (!c->first_overflow_output)
            c->first_overflow_output = i;
         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   /* Allocate program temporaries:
    */

   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
      c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                              reg,
                                              0,
                                              BRW_REGISTER_TYPE_D,
                                              BRW_VERTICAL_STRIDE_8,
                                              BRW_WIDTH_8,
                                              BRW_HORIZONTAL_STRIDE_1,
                                              BRW_SWIZZLE_XXXX,
                                              BRW_WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging regs for get_constant(): one per src argument. */
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

#if 0
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
#endif

   /* Call stack for subroutine return addresses (2 GRFs). */
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
   reg += 2;

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg; /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;

   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* Header size differs per chipset (see the mrf start value above). */
   if (c->chipset.is_igdng)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (BRW_DEBUG & DEBUG_VS) {
      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
      debug_printf("%s NumTemps %d\n", __FUNCTION__,
                   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
296
297
298 /**
299 * If an instruction uses a temp reg both as a src and the dest, we
300 * sometimes need to allocate an intermediate temporary.
301 */
302 static void unalias1( struct brw_vs_compile *c,
303 struct brw_reg dst,
304 struct brw_reg arg0,
305 void (*func)( struct brw_vs_compile *,
306 struct brw_reg,
307 struct brw_reg ))
308 {
309 if (dst.file == arg0.file && dst.nr == arg0.nr) {
310 struct brw_compile *p = &c->func;
311 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
312 func(c, tmp, arg0);
313 brw_MOV(p, dst, tmp);
314 release_tmp(c, tmp);
315 }
316 else {
317 func(c, dst, arg0);
318 }
319 }
320
321 /**
322 * \sa unalias2
323 * Checkes if 2-operand instruction needs an intermediate temporary.
324 */
325 static void unalias2( struct brw_vs_compile *c,
326 struct brw_reg dst,
327 struct brw_reg arg0,
328 struct brw_reg arg1,
329 void (*func)( struct brw_vs_compile *,
330 struct brw_reg,
331 struct brw_reg,
332 struct brw_reg ))
333 {
334 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
335 (dst.file == arg1.file && dst.nr == arg1.nr)) {
336 struct brw_compile *p = &c->func;
337 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
338 func(c, tmp, arg0, arg1);
339 brw_MOV(p, dst, tmp);
340 release_tmp(c, tmp);
341 }
342 else {
343 func(c, dst, arg0, arg1);
344 }
345 }
346
347 /**
348 * \sa unalias2
349 * Checkes if 3-operand instruction needs an intermediate temporary.
350 */
351 static void unalias3( struct brw_vs_compile *c,
352 struct brw_reg dst,
353 struct brw_reg arg0,
354 struct brw_reg arg1,
355 struct brw_reg arg2,
356 void (*func)( struct brw_vs_compile *,
357 struct brw_reg,
358 struct brw_reg,
359 struct brw_reg,
360 struct brw_reg ))
361 {
362 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
363 (dst.file == arg1.file && dst.nr == arg1.nr) ||
364 (dst.file == arg2.file && dst.nr == arg2.nr)) {
365 struct brw_compile *p = &c->func;
366 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
367 func(c, tmp, arg0, arg1, arg2);
368 brw_MOV(p, dst, tmp);
369 release_tmp(c, tmp);
370 }
371 else {
372 func(c, dst, arg0, arg1, arg2);
373 }
374 }
375
/* Generic "set-on-condition" helper for the SEQ/SNE/SLT/... opcodes:
 *   dst = (arg0 <cond> arg1) ? 1.0 : 0.0   (per channel)
 *
 * dst is cleared to 0.0 first; the CMP loads the flag register with the
 * per-channel comparison result, and the following MOV of 1.0 is expected
 * to land only in the channels that passed.  NOTE(review): this relies on
 * brw_CMP leaving predication enabled for the next instruction -- confirm
 * against brw_CMP's implementation.  The flag value is then forced back to
 * all-ones so subsequent instructions are unaffected.
 */
static void emit_sop( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
387
/* SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0, per channel. */
static void emit_seq( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
395
/* SNE: dst = (arg0 != arg1) ? 1.0 : 0.0, per channel. */
static void emit_sne( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0, per channel. */
static void emit_slt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
410
/* SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sle( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
418
/* SGT: dst = (arg0 > arg1) ? 1.0 : 0.0, per channel. */
static void emit_sgt( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
426
/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0, per channel. */
static void emit_sge( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
434
/* Componentwise maximum: dst = max(arg0, arg1).
 * The CMP loads the flag register where arg0 < arg1; the SEL then picks
 * arg1 in those channels and arg0 elsewhere.  NOTE(review): relies on
 * brw_CMP leaving the SEL predicated -- confirm against brw_CMP.
 * Predication is explicitly cleared afterwards.
 */
static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg1, arg0);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
444
/* Componentwise minimum: dst = min(arg0, arg1).
 * Same pattern as emit_max but with the SEL operands swapped: where
 * arg0 < arg1 the (predicated) SEL keeps arg0, else arg1.
 */
static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
454
455
/* Emit a one-operand extended-math-unit call (EXP, LOG, RSQ, INV, ...).
 * 'function' selects the math-box operation, 'precision' its
 * full/partial precision mode.
 */
static void emit_math1( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   /* Route the result through a scratch GRF whenever dst is partially
    * writemasked or is not a GRF (see the kludge note above).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,             /* NOTE(review): presumably the base message reg -- confirm vs brw_math() */
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
491
492
/* Emit a two-operand extended-math-unit call (e.g. POW).
 * arg1 is staged by hand into message reg 3 before the SEND; arg0 goes
 * through brw_math() itself.  NOTE(review): the literal 2 is presumably
 * the base message register, making arg1 the second message phase --
 * confirm against brw_math().  As in emit_math1, the result is routed
 * through a scratch GRF when dst is writemasked or not a GRF.
 */
static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
524
525
/* Emit code for the legacy EXP opcode, assuming dst does not alias arg0:
 *   dst.x = 2 ^ floor(arg0.x)   (exponent bits built by hand, see below)
 *   dst.y = arg0.x - floor(arg0.x)
 *   dst.z = 2 ^ arg0.x          (via the math box)
 *   dst.w = 1.0
 * Each component is only computed when dst's writemask requests it.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
              tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, BRW_WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
   }
}
581
582
/* Emit code for the legacy LOG opcode, assuming dst does not alias arg0:
 *   dst.x = exponent of |arg0.x|   (extracted from the IEEE-754 bits)
 *   dst.y = mantissa of |arg0.x|, renormalized into [1, 2)
 *   dst.z = log2(|arg0.x|) = dst.x + log2(dst.y)   (via the math box)
 *   dst.w = 1.0
 * Components are only computed when dst's writemask requests them; x and
 * y are also computed when only z is requested since z depends on both.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Bit-twiddling on dst needs a real, fully-writable GRF. */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent field down. */
      brw_AND(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      /* Remove the IEEE exponent bias and convert to float. */
      brw_ADD(p,
              brw_writemask(tmp, BRW_WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
      /* Keep the mantissa bits and install a zero exponent (bias 127),
       * yielding a float in [1, 2).
       */
      brw_AND(p,
              brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, BRW_WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, BRW_WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
672
673
674 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
675 */
676 static void emit_dst_noalias( struct brw_vs_compile *c,
677 struct brw_reg dst,
678 struct brw_reg arg0,
679 struct brw_reg arg1)
680 {
681 struct brw_compile *p = &c->func;
682
683 /* There must be a better way to do this:
684 */
685 if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
686 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
687 if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
688 brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
689 if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
690 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
691 if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
692 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
693 }
694
695
/* Cross product: dst = t X u, computed via the accumulator:
 *   acc = t.yzxw * u.zxyw
 *   dst = acc - t.zxyw * u.yzxw   (MAC with negated first operand)
 * NOTE(review): assumes a MUL with a null destination still writes the
 * accumulator that the MAC then reads -- confirm against the EU docs.
 */
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
704
705
706 static void emit_lit_noalias( struct brw_vs_compile *c,
707 struct brw_reg dst,
708 struct brw_reg arg0 )
709 {
710 struct brw_compile *p = &c->func;
711 struct brw_instruction *if_insn;
712 struct brw_reg tmp = dst;
713 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
714
715 if (need_tmp)
716 tmp = get_tmp(c);
717
718 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
719 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));
720
721 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
722 * to get all channels active inside the IF. In the clipping code
723 * we run with NoMask, so it's not an option and we can use
724 * BRW_EXECUTE_1 for all comparisions.
725 */
726 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
727 if_insn = brw_IF(p, BRW_EXECUTE_8);
728 {
729 brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
730
731 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
732 brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z), brw_swizzle1(arg0,1));
733 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
734
735 emit_math2(c,
736 BRW_MATH_FUNCTION_POW,
737 brw_writemask(dst, BRW_WRITEMASK_Z),
738 brw_swizzle1(tmp, 2),
739 brw_swizzle1(arg0, 3),
740 BRW_MATH_PRECISION_PARTIAL);
741 }
742
743 brw_ENDIF(p, if_insn);
744
745 release_tmp(c, tmp);
746 }
747
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, assuming dst aliases no
 * source (dst is used as scratch for the intermediate):
 *   dst = 1 - arg0
 *   acc = dst * arg2   (NOTE(review): assumes a MUL with a null dst
 *                       still writes the accumulator -- confirm)
 *   dst = acc + arg0 * arg1   (MAC)
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
760
761 /** 3 or 4-component vector normalization */
762 static void emit_nrm( struct brw_vs_compile *c,
763 struct brw_reg dst,
764 struct brw_reg arg0,
765 int num_comps)
766 {
767 struct brw_compile *p = &c->func;
768 struct brw_reg tmp = get_tmp(c);
769
770 /* tmp = dot(arg0, arg0) */
771 if (num_comps == 3)
772 brw_DP3(p, tmp, arg0, arg0);
773 else
774 brw_DP4(p, tmp, arg0, arg0);
775
776 /* tmp = 1 / sqrt(tmp) */
777 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
778
779 /* dst = arg0 * tmp */
780 brw_MUL(p, dst, arg0, tmp);
781
782 release_tmp(c, tmp);
783 }
784
785
/**
 * Fetch the float[4] constant at 'index' for src argument 'argIndex'
 * from the real constant buffer (used when constants don't fit in the
 * GRF).  The value is read into the per-argument staging register via
 * a dataport read; a one-entry cache (current_const[argIndex].index)
 * avoids re-fetching when the same index repeats.  Relative-addressed
 * reads can't be cached and need a second oword read through the upper
 * half of the address reg (presumably one read per vertex of the
 * interleaved pair -- NOTE(review): confirm).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint index,
             GLboolean relAddr)
{
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg;
   struct brw_reg const2_reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != index || relAddr) {
      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];

      c->current_const[argIndex].index = index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src.Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       c->current_const[argIndex].reg,/* writeback dest */
                       0,                             /* oword */
                       relAddr,                       /* relative indexing? */
                       addrReg,                       /* address register */
                       16 * index,                    /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );

      if (relAddr) {
         /* second read */
         const2_reg = get_tmp(c);

         /* use upper half of address reg for second read */
         addrReg = stride(addrReg, 0, 4, 0);
         addrReg.subnr = 16;

         brw_dp_READ_4_vs(p,
                          const2_reg,              /* writeback dest */
                          1,                       /* oword */
                          relAddr,                 /* relative indexing? */
                          addrReg,                 /* address register */
                          16 * index,              /* byte offset */
                          SURF_INDEX_VERT_CONST_BUFFER
                          );
      }
   }

   const_reg = c->current_const[argIndex].reg;

   if (relAddr) {
      /* merge the two Owords into the constant register */
      /* const_reg[7..4] = const2_reg[7..4] */
      brw_MOV(p,
              suboffset(stride(const_reg, 0, 4, 1), 4),
              suboffset(stride(const2_reg, 0, 4, 1), 4));
      release_tmp(c, const2_reg);
   }
   else {
      /* replicate lower four floats into upper half (to get XYZWXYZW) */
      const_reg = stride(const_reg, 0, 4, 0);
      const_reg.subnr = 0;
   }

   return const_reg;
}
854
855
856
857 /* TODO: relative addressing!
858 */
859 static struct brw_reg get_reg( struct brw_vs_compile *c,
860 enum tgsi_file_type file,
861 GLuint index )
862 {
863 switch (file) {
864 case TGSI_FILE_TEMPORARY:
865 case TGSI_FILE_INPUT:
866 case TGSI_FILE_OUTPUT:
867 case TGSI_FILE_CONSTANT:
868 assert(c->regs[file][index].nr != 0);
869 return c->regs[file][index];
870
871 case TGSI_FILE_ADDRESS:
872 assert(index == 0);
873 return c->regs[file][index];
874
875 case TGSI_FILE_NULL: /* undef values */
876 return brw_null_reg();
877
878 default:
879 assert(0);
880 return brw_null_reg();
881 }
882 }
883
884
/**
 * Indirect addressing: get reg[[arg] + offset].
 * 'arg' is the base register of the indexed file; the runtime index
 * lives in the software address reg (TGSI_FILE_ADDRESS[0]), already
 * scaled to bytes by emit_arl().  byte_offset adds the base reg's
 * position (32 bytes per GRF, 16 per vec4 half -- NOTE(review):
 * confirm sizes against the register layout).
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = vec4(get_tmp(c));
   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
   struct brw_reg indirect = brw_vec4_indirect(0,0);

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* This is pretty clunky - load the address register twice and
       * fetch each 4-dword value in turn.  There must be a way to do
       * this in a single pass, but I couldn't get it to work.
       */
      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
      brw_MOV(p, tmp, indirect);

      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
      brw_MOV(p, suboffset(tmp, 4), indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released -- the caller keeps using the returned reg;
    * it is reclaimed by the per-instruction release_tmps().
    */
   return vec8(tmp);
}
919
920
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * NOTE(review): TGSI_FILE_OUTPUT is accepted as a source file here,
 * but outputs may have been allocated to message registers (see
 * brw_vs_alloc_regs), which cannot be read back -- per the commit
 * title, TGSI outputs cannot be used as source regs; confirm how
 * callers avoid this case.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             GLuint argIndex,
             GLuint file,
             GLint index,
             GLboolean relAddr )
{

   switch (file) {
   case TGSI_FILE_TEMPORARY:
   case TGSI_FILE_INPUT:
   case TGSI_FILE_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index);
      }
      else {
         /* reg nr 0 would mean the slot was never allocated */
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case TGSI_FILE_IMMEDIATE:
      return c->regs[file][index];

   case TGSI_FILE_CONSTANT:
      if (c->vp->use_const_buffer) {
         return get_constant(c, argIndex, index, relAddr);
      }
      else if (relAddr) {
         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
      }
      else {
         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
         return c->regs[TGSI_FILE_CONSTANT][index];
      }
   case TGSI_FILE_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case TGSI_FILE_NULL:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   default:
      assert(0);
      return brw_null_reg();
   }
}
972
973
974 static void emit_arl( struct brw_vs_compile *c,
975 struct brw_reg dst,
976 struct brw_reg arg0 )
977 {
978 struct brw_compile *p = &c->func;
979 struct brw_reg tmp = dst;
980 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
981
982 if (need_tmp)
983 tmp = get_tmp(c);
984
985 brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
986 brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
987
988 if (need_tmp)
989 release_tmp(c, tmp);
990 }
991
992
993 /**
994 * Return the brw reg for the given instruction's src argument.
995 */
996 static struct brw_reg get_arg( struct brw_vs_compile *c,
997 const struct tgsi_full_src_register *src,
998 GLuint argIndex )
999 {
1000 struct brw_reg reg;
1001
1002 if (src->SrcRegister.File == TGSI_FILE_NULL)
1003 return brw_null_reg();
1004
1005 reg = get_src_reg(c, argIndex,
1006 src->SrcRegister.File,
1007 src->SrcRegister.Index,
1008 src->SrcRegister.Indirect);
1009
1010 /* Convert 3-bit swizzle to 2-bit.
1011 */
1012 reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SrcRegister.SwizzleX,
1013 src->SrcRegister.SwizzleY,
1014 src->SrcRegister.SwizzleZ,
1015 src->SrcRegister.SwizzleW);
1016
1017 reg.negate = src->SrcRegister.Negate ? 1 : 0;
1018
1019 /* XXX: abs, absneg
1020 */
1021
1022 return reg;
1023 }
1024
1025
1026 /**
1027 * Get brw register for the given program dest register.
1028 */
1029 static struct brw_reg get_dst( struct brw_vs_compile *c,
1030 unsigned file,
1031 unsigned index,
1032 unsigned writemask )
1033 {
1034 struct brw_reg reg;
1035
1036 switch (file) {
1037 case TGSI_FILE_TEMPORARY:
1038 case TGSI_FILE_OUTPUT:
1039 assert(c->regs[file][index].nr != 0);
1040 reg = c->regs[file][index];
1041 break;
1042 case TGSI_FILE_ADDRESS:
1043 assert(index == 0);
1044 reg = c->regs[file][index];
1045 break;
1046 case TGSI_FILE_NULL:
1047 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1048 reg = brw_null_reg();
1049 break;
1050 default:
1051 assert(0);
1052 reg = brw_null_reg();
1053 }
1054
1055 reg.dw1.bits.writemask = writemask;
1056
1057 return reg;
1058 }
1059
1060
1061
1062
1063 /**
1064 * Post-vertex-program processing. Send the results to the URB.
1065 */
1066 static void emit_vertex_write( struct brw_vs_compile *c)
1067 {
1068 struct brw_compile *p = &c->func;
1069 struct brw_reg m0 = brw_message_reg(0);
1070 struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
1071 struct brw_reg ndc;
1072 int eot;
1073 GLuint len_vertext_header = 2;
1074
1075 if (c->key.copy_edgeflag) {
1076 assert(0);
1077 brw_MOV(p,
1078 get_reg(c, TGSI_FILE_OUTPUT, 0),
1079 get_reg(c, TGSI_FILE_INPUT, 0));
1080 }
1081
1082 /* Build ndc coords */
1083 ndc = get_tmp(c);
1084 /* ndc = 1.0 / pos.w */
1085 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1086 /* ndc.xyz = pos * ndc */
1087 brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
1088
1089 /* Update the header for point size, user clipping flags, and -ve rhw
1090 * workaround.
1091 */
1092 if (c->prog_data.writes_psiz ||
1093 c->key.nr_userclip ||
1094 c->chipset.is_965)
1095 {
1096 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1097 GLuint i;
1098
1099 brw_MOV(p, header1, brw_imm_ud(0));
1100
1101 brw_set_access_mode(p, BRW_ALIGN_16);
1102
1103 if (c->prog_data.writes_psiz) {
1104 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
1105 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1106 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
1107 }
1108
1109 for (i = 0; i < c->key.nr_userclip; i++) {
1110 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1111 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1112 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
1113 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1114 }
1115
1116 /* i965 clipping workaround:
1117 * 1) Test for -ve rhw
1118 * 2) If set,
1119 * set ndc = (0,0,0,0)
1120 * set ucp[6] = 1
1121 *
1122 * Later, clipping will detect ucp[6] and ensure the primitive is
1123 * clipped against all fixed planes.
1124 */
1125 if (c->chipset.is_965) {
1126 brw_CMP(p,
1127 vec8(brw_null_reg()),
1128 BRW_CONDITIONAL_L,
1129 brw_swizzle1(ndc, 3),
1130 brw_imm_f(0));
1131
1132 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
1133 brw_MOV(p, ndc, brw_imm_f(0));
1134 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1135 }
1136
1137 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1138 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1139 brw_set_access_mode(p, BRW_ALIGN_16);
1140
1141 release_tmp(c, header1);
1142 }
1143 else {
1144 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1145 }
1146
1147 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1148 * of zeros followed by two sets of NDC coordinates:
1149 */
1150 brw_set_access_mode(p, BRW_ALIGN_1);
1151 brw_MOV(p, offset(m0, 2), ndc);
1152
1153 if (c->chipset.is_igdng) {
1154 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1155 brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
1156 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1157 * Seems it is useless for us.
1158 * m6 is used for aligning, so that the remainder of vertex element is
1159 * reg-aligned.
1160 */
1161 brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
1162 len_vertext_header = 6;
1163 } else {
1164 brw_MOV(p, offset(m0, 3), pos);
1165 len_vertext_header = 2;
1166 }
1167
1168 eot = (c->first_overflow_output == 0);
1169
1170 brw_urb_WRITE(p,
1171 brw_null_reg(), /* dest */
1172 0, /* starting mrf reg nr */
1173 c->r0, /* src */
1174 0, /* allocate */
1175 1, /* used */
1176 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
1177 0, /* response len */
1178 eot, /* eot */
1179 eot, /* writes complete */
1180 0, /* urb destination offset */
1181 BRW_URB_SWIZZLE_INTERLEAVE);
1182
1183 if (c->first_overflow_output > 0) {
1184 /* Not all of the vertex outputs/results fit into the MRF.
1185 * Move the overflowed attributes from the GRF to the MRF and
1186 * issue another brw_urb_WRITE().
1187 */
1188 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1189 * at mrf[4] atm...
1190 */
1191 GLuint i, mrf = 0;
1192 for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
1193 /* move from GRF to MRF */
1194 brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
1195 mrf++;
1196 }
1197
1198 brw_urb_WRITE(p,
1199 brw_null_reg(), /* dest */
1200 4, /* starting mrf reg nr */
1201 c->r0, /* src */
1202 0, /* allocate */
1203 1, /* used */
1204 mrf+1, /* msg len */
1205 0, /* response len */
1206 1, /* eot */
1207 1, /* writes complete */
1208 BRW_MAX_MRF-1, /* urb destination offset */
1209 BRW_URB_SWIZZLE_INTERLEAVE);
1210 }
1211 }
1212
1213
1214 /**
1215 * Called after code generation to resolve subroutine calls and the
1216 * END instruction.
1217 * \param end_inst points to brw code for END instruction
1218 * \param last_inst points to last instruction emitted before vertex write
1219 */
1220 static void
1221 post_vs_emit( struct brw_vs_compile *c,
1222 struct brw_instruction *end_inst,
1223 struct brw_instruction *last_inst )
1224 {
1225 GLint offset;
1226
1227 brw_resolve_cals(&c->func);
1228
1229 /* patch up the END code to jump past subroutines, etc */
1230 offset = last_inst - end_inst;
1231 if (offset > 1) {
1232 brw_set_src1(end_inst, brw_imm_d(offset * 16));
1233 } else {
1234 end_inst->header.opcode = BRW_OPCODE_NOP;
1235 }
1236 }
1237
1238 static uint32_t
1239 get_predicate(const struct tgsi_full_instruction *inst)
1240 {
1241 /* XXX: disabling for now
1242 */
1243 #if 0
1244 if (inst->dst.CondMask == COND_TR)
1245 return BRW_PREDICATE_NONE;
1246
1247 /* All of GLSL only produces predicates for COND_NE and one channel per
1248 * vector. Fail badly if someone starts doing something else, as it might
1249 * mean infinite looping or something.
1250 *
1251 * We'd like to support all the condition codes, but our hardware doesn't
1252 * quite match the Mesa IR, which is modeled after the NV extensions. For
1253 * those, the instruction may update the condition codes or not, then any
1254 * later instruction may use one of those condition codes. For gen4, the
1255 * instruction may update the flags register based on one of the condition
1256 * codes output by the instruction, and then further instructions may
1257 * predicate on that. We can probably support this, but it won't
1258 * necessarily be easy.
1259 */
1260 /* assert(inst->dst.CondMask == COND_NE); */
1261
1262 switch (inst->dst.CondSwizzle) {
1263 case SWIZZLE_XXXX:
1264 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1265 case SWIZZLE_YYYY:
1266 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1267 case SWIZZLE_ZZZZ:
1268 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1269 case SWIZZLE_WWWW:
1270 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1271 default:
1272 debug_printf("Unexpected predicate: 0x%08x\n",
1273 inst->dst.CondMask);
1274 return BRW_PREDICATE_NORMAL;
1275 }
1276 #else
1277 return BRW_PREDICATE_NORMAL;
1278 #endif
1279 }
1280
1281 static void emit_insn(struct brw_vs_compile *c,
1282 const struct tgsi_full_instruction *inst)
1283 {
1284 unsigned opcode = inst->Instruction.Opcode;
1285 unsigned label = inst->InstructionExtLabel.Label;
1286 struct brw_compile *p = &c->func;
1287 struct brw_reg args[3], dst;
1288 GLuint i;
1289
1290 #if 0
1291 printf("%d: ", insn);
1292 _mesa_print_instruction(inst);
1293 #endif
1294
1295 /* Get argument regs.
1296 */
1297 for (i = 0; i < 3; i++) {
1298 args[i] = get_arg(c, &inst->FullSrcRegisters[i], i);
1299 }
1300
1301 /* Get dest regs. Note that it is possible for a reg to be both
1302 * dst and arg, given the static allocation of registers. So
1303 * care needs to be taken emitting multi-operation instructions.
1304 */
1305 dst = get_dst(c,
1306 inst->FullDstRegisters[0].DstRegister.File,
1307 inst->FullDstRegisters[0].DstRegister.Index,
1308 inst->FullDstRegisters[0].DstRegister.WriteMask);
1309
1310 /* XXX: saturate
1311 */
1312 if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
1313 debug_printf("Unsupported saturate in vertex shader");
1314 }
1315
1316 switch (opcode) {
1317 case TGSI_OPCODE_ABS:
1318 brw_MOV(p, dst, brw_abs(args[0]));
1319 break;
1320 case TGSI_OPCODE_ADD:
1321 brw_ADD(p, dst, args[0], args[1]);
1322 break;
1323 case TGSI_OPCODE_COS:
1324 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1325 break;
1326 case TGSI_OPCODE_DP3:
1327 brw_DP3(p, dst, args[0], args[1]);
1328 break;
1329 case TGSI_OPCODE_DP4:
1330 brw_DP4(p, dst, args[0], args[1]);
1331 break;
1332 case TGSI_OPCODE_DPH:
1333 brw_DPH(p, dst, args[0], args[1]);
1334 break;
1335 case TGSI_OPCODE_NRM:
1336 emit_nrm(c, dst, args[0], 3);
1337 break;
1338 case TGSI_OPCODE_NRM4:
1339 emit_nrm(c, dst, args[0], 4);
1340 break;
1341 case TGSI_OPCODE_DST:
1342 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1343 break;
1344 case TGSI_OPCODE_EXP:
1345 unalias1(c, dst, args[0], emit_exp_noalias);
1346 break;
1347 case TGSI_OPCODE_EX2:
1348 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1349 break;
1350 case TGSI_OPCODE_ARL:
1351 emit_arl(c, dst, args[0]);
1352 break;
1353 case TGSI_OPCODE_FLR:
1354 brw_RNDD(p, dst, args[0]);
1355 break;
1356 case TGSI_OPCODE_FRC:
1357 brw_FRC(p, dst, args[0]);
1358 break;
1359 case TGSI_OPCODE_LOG:
1360 unalias1(c, dst, args[0], emit_log_noalias);
1361 break;
1362 case TGSI_OPCODE_LG2:
1363 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1364 break;
1365 case TGSI_OPCODE_LIT:
1366 unalias1(c, dst, args[0], emit_lit_noalias);
1367 break;
1368 case TGSI_OPCODE_LRP:
1369 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
1370 break;
1371 case TGSI_OPCODE_MAD:
1372 brw_MOV(p, brw_acc_reg(), args[2]);
1373 brw_MAC(p, dst, args[0], args[1]);
1374 break;
1375 case TGSI_OPCODE_MAX:
1376 emit_max(p, dst, args[0], args[1]);
1377 break;
1378 case TGSI_OPCODE_MIN:
1379 emit_min(p, dst, args[0], args[1]);
1380 break;
1381 case TGSI_OPCODE_MOV:
1382 brw_MOV(p, dst, args[0]);
1383 break;
1384 case TGSI_OPCODE_MUL:
1385 brw_MUL(p, dst, args[0], args[1]);
1386 break;
1387 case TGSI_OPCODE_POW:
1388 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1389 break;
1390 case TGSI_OPCODE_RCP:
1391 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1392 break;
1393 case TGSI_OPCODE_RSQ:
1394 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1395 break;
1396 case TGSI_OPCODE_SEQ:
1397 emit_seq(p, dst, args[0], args[1]);
1398 break;
1399 case TGSI_OPCODE_SIN:
1400 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1401 break;
1402 case TGSI_OPCODE_SNE:
1403 emit_sne(p, dst, args[0], args[1]);
1404 break;
1405 case TGSI_OPCODE_SGE:
1406 emit_sge(p, dst, args[0], args[1]);
1407 break;
1408 case TGSI_OPCODE_SGT:
1409 emit_sgt(p, dst, args[0], args[1]);
1410 break;
1411 case TGSI_OPCODE_SLT:
1412 emit_slt(p, dst, args[0], args[1]);
1413 break;
1414 case TGSI_OPCODE_SLE:
1415 emit_sle(p, dst, args[0], args[1]);
1416 break;
1417 case TGSI_OPCODE_SUB:
1418 brw_ADD(p, dst, args[0], negate(args[1]));
1419 break;
1420 case TGSI_OPCODE_TRUNC:
1421 /* round toward zero */
1422 brw_RNDZ(p, dst, args[0]);
1423 break;
1424 case TGSI_OPCODE_XPD:
1425 emit_xpd(p, dst, args[0], args[1]);
1426 break;
1427 case TGSI_OPCODE_IF:
1428 assert(c->if_depth < MAX_IF_DEPTH);
1429 c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
1430 /* Note that brw_IF smashes the predicate_control field. */
1431 c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
1432 c->if_depth++;
1433 break;
1434 case TGSI_OPCODE_ELSE:
1435 c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
1436 break;
1437 case TGSI_OPCODE_ENDIF:
1438 assert(c->if_depth > 0);
1439 brw_ENDIF(p, c->if_inst[--c->if_depth]);
1440 break;
1441 case TGSI_OPCODE_BGNLOOP:
1442 c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1443 break;
1444 case TGSI_OPCODE_BRK:
1445 brw_set_predicate_control(p, get_predicate(inst));
1446 brw_BREAK(p);
1447 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1448 break;
1449 case TGSI_OPCODE_CONT:
1450 brw_set_predicate_control(p, get_predicate(inst));
1451 brw_CONT(p);
1452 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1453 break;
1454 case TGSI_OPCODE_ENDLOOP:
1455 {
1456 struct brw_instruction *inst0, *inst1;
1457 GLuint br = 1;
1458
1459 c->loop_depth--;
1460
1461 if (c->chipset.is_igdng)
1462 br = 2;
1463
1464 inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
1465 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1466 while (inst0 > c->loop_inst[c->loop_depth]) {
1467 inst0--;
1468 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
1469 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
1470 inst0->bits3.if_else.pop_count = 0;
1471 }
1472 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
1473 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
1474 inst0->bits3.if_else.pop_count = 0;
1475 }
1476 }
1477 }
1478 break;
1479 case TGSI_OPCODE_BRA:
1480 brw_set_predicate_control(p, get_predicate(inst));
1481 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1482 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1483 break;
1484 case TGSI_OPCODE_CAL:
1485 brw_set_access_mode(p, BRW_ALIGN_1);
1486 brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1487 brw_set_access_mode(p, BRW_ALIGN_16);
1488 brw_ADD(p, get_addr_reg(c->stack_index),
1489 get_addr_reg(c->stack_index), brw_imm_d(4));
1490 brw_save_call(p, label, p->nr_insn);
1491 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1492 break;
1493 case TGSI_OPCODE_RET:
1494 brw_ADD(p, get_addr_reg(c->stack_index),
1495 get_addr_reg(c->stack_index), brw_imm_d(-4));
1496 brw_set_access_mode(p, BRW_ALIGN_1);
1497 brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
1498 brw_set_access_mode(p, BRW_ALIGN_16);
1499 break;
1500 case TGSI_OPCODE_END:
1501 c->end_offset = p->nr_insn;
1502 /* this instruction will get patched later to jump past subroutine
1503 * code, etc.
1504 */
1505 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1506 break;
1507 case TGSI_OPCODE_BGNSUB:
1508 brw_save_label(p, p->nr_insn, p->nr_insn);
1509 break;
1510 case TGSI_OPCODE_ENDSUB:
1511 /* no-op */
1512 break;
1513 default:
1514 debug_printf("Unsupported opcode %i (%s) in vertex shader",
1515 opcode,
1516 tgsi_get_opcode_name(opcode));
1517 }
1518
1519 /* Set the predication update on the last instruction of the native
1520 * instruction sequence.
1521 *
1522 * This would be problematic if it was set on a math instruction,
1523 * but that shouldn't be the case with the current GLSL compiler.
1524 */
1525 #if 0
1526 /* XXX: disabled
1527 */
1528 if (inst->CondUpdate) {
1529 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
1530
1531 assert(hw_insn->header.destreg__conditionalmod == 0);
1532 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
1533 }
1534 #endif
1535
1536 release_tmps(c);
1537 }
1538
1539
1540 /* Emit the vertex program instructions here.
1541 */
1542 void brw_vs_emit(struct brw_vs_compile *c)
1543 {
1544 struct brw_compile *p = &c->func;
1545 const struct tgsi_token *tokens = c->vp->tokens;
1546 struct brw_instruction *end_inst, *last_inst;
1547 struct tgsi_parse_context parse;
1548 struct tgsi_full_instruction *inst;
1549 boolean done = FALSE;
1550 int i;
1551
1552 if (BRW_DEBUG & DEBUG_VS)
1553 tgsi_dump(c->vp->tokens, 0);
1554
1555 c->stack_index = brw_indirect(0, 0);
1556
1557 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1558 brw_set_access_mode(p, BRW_ALIGN_16);
1559
1560 /* Inputs */
1561 tgsi_parse_init( &parse, tokens );
1562 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1563 tgsi_parse_token( &parse );
1564
1565 switch( parse.FullToken.Token.Type ) {
1566 case TGSI_TOKEN_TYPE_DECLARATION:
1567 /* Nothing to do -- using info from tgsi_scan().
1568 */
1569 break;
1570
1571 case TGSI_TOKEN_TYPE_IMMEDIATE: {
1572 static const float id[4] = {0,0,0,1};
1573 const float *imm = &parse.FullToken.FullImmediate.u[i].Float;
1574 unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1575
1576 for (i = 0; i < size; i++)
1577 c->immediate[c->nr_immediates][i] = imm[i];
1578
1579 for ( ; i < 4; i++)
1580 c->immediate[c->nr_immediates][i] = id[i];
1581
1582 c->nr_immediates++;
1583 break;
1584 }
1585
1586 case TGSI_TOKEN_TYPE_INSTRUCTION:
1587 done = 1;
1588 break;
1589 }
1590 }
1591
1592 /* Static register allocation
1593 */
1594 brw_vs_alloc_regs(c);
1595 brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
1596
1597 /* Instructions
1598 */
1599 tgsi_parse_init( &parse, tokens );
1600 while( !tgsi_parse_end_of_tokens( &parse ) ) {
1601 tgsi_parse_token( &parse );
1602
1603 switch( parse.FullToken.Token.Type ) {
1604 case TGSI_TOKEN_TYPE_DECLARATION:
1605 case TGSI_TOKEN_TYPE_IMMEDIATE:
1606 break;
1607
1608 case TGSI_TOKEN_TYPE_INSTRUCTION:
1609 inst = &parse.FullToken.FullInstruction;
1610 emit_insn( c, inst );
1611 break;
1612
1613 default:
1614 assert( 0 );
1615 }
1616 }
1617 tgsi_parse_free( &parse );
1618
1619 end_inst = &p->store[c->end_offset];
1620 last_inst = &p->store[p->nr_insn];
1621
1622 /* The END instruction will be patched to jump to this code */
1623 emit_vertex_write(c);
1624
1625 post_vs_emit(c, end_inst, last_inst);
1626
1627 if (BRW_DEBUG & DEBUG_VS) {
1628 debug_printf("vs-native:\n");
1629 brw_disasm(stderr, p->store, p->nr_insn);
1630 }
1631 }