src/gallium/drivers/i965simple/brw_wm_decl.c

   1
   2 #include "brw_context.h"
   3 #include "brw_eu.h"
   4 #include "brw_wm.h"
   5 #include "util/u_math.h"
   6 #include "util/u_memory.h"
   7 #include "pipe/p_shader_tokens.h"
   8 #include "tgsi/tgsi_parse.h"
   9
  10 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
  11 {
  12    c->tmp_index++;
  13    c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index);
  14    return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
  15 }
  16
  17 static void release_tmps(struct brw_wm_compile *c)
  18 {
  19    c->tmp_index = 0;
  20 }
  21
  22
  23
  24 static int is_null( struct brw_reg reg )
  25 {
  26    return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
  27            reg.nr == BRW_ARF_NULL);
  28 }
  29
  30 static void emit_pixel_xy( struct brw_wm_compile *c )
  31 {
  32    if (is_null(c->pixel_xy[0])) {
  33
  34       struct brw_compile *p = &c->func;
  35       struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
  36
  37       c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
  38       c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
  39
  40       /* Calculate pixel centers by adding 1 or 0 to each of the
  41        * micro-tile coordinates passed in r1.
  42        */
  43       brw_ADD(p,
  44               c->pixel_xy[0],
  45               stride(suboffset(r1_uw, 4), 2, 4, 0),
  46               brw_imm_v(0x10101010));
  47
  48       brw_ADD(p,
  49               c->pixel_xy[1],
  50               stride(suboffset(r1_uw, 5), 2, 4, 0),
  51               brw_imm_v(0x11001100));
  52    }
  53 }
  54
  55
  56
  57
  58
  59
  60 static void emit_delta_xy( struct brw_wm_compile *c )
  61 {
  62    if (is_null(c->delta_xy[0])) {
  63       struct brw_compile *p = &c->func;
  64       struct brw_reg r1 = brw_vec1_grf(1, 0);
  65
  66       emit_pixel_xy(c);
  67
  68       c->delta_xy[0] = alloc_tmp(c);
  69       c->delta_xy[1] = alloc_tmp(c);
  70
  71       /* Calc delta X,Y by subtracting origin in r1 from the pixel
  72        * centers.
  73        */
  74       brw_ADD(p,
  75               c->delta_xy[0],
  76               retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
  77               negate(r1));
  78
  79       brw_ADD(p,
  80               c->delta_xy[1],
  81               retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
  82               negate(suboffset(r1,1)));
  83    }
  84 }
  85
  86
  87
  88 #if 0
  89 static void emit_pixel_w( struct brw_wm_compile *c )
  90 {
  91    if (is_null(c->pixel_w)) {
  92       struct brw_compile *p = &c->func;
  93
  94       struct brw_reg interp_wpos = c->coef_wpos;
  95
  96       c->pixel_w = alloc_tmp(c);
  97
  98       emit_delta_xy(c);
  99
 100       /* Calc 1/w - just linterp wpos[3] optimized by putting the
 101        * result straight into a message reg.
 102        */
 103       struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4);
 104       brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
 105       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);
 106
 107       /* Calc w */
 108       brw_math_16( p,
 109                    c->pixel_w,
 110                    BRW_MATH_FUNCTION_INV,
 111                    BRW_MATH_SATURATE_NONE,
 112                    2,
 113                    brw_null_reg(),
 114                    BRW_MATH_PRECISION_FULL);
 115    }
 116 }
 117 #endif
 118
 119
 120 static void emit_cinterp(struct brw_wm_compile *c,
 121                          int idx,
 122                          int mask )
 123 {
 124    struct brw_compile *p = &c->func;
 125    struct brw_reg interp[4];
 126    struct brw_reg coef = c->payload_coef[idx];
 127    int i;
 128
 129    interp[0] = brw_vec1_grf(coef.nr, 0);
 130    interp[1] = brw_vec1_grf(coef.nr, 4);
 131    interp[2] = brw_vec1_grf(coef.nr+1, 0);
 132    interp[3] = brw_vec1_grf(coef.nr+1, 4);
 133
 134    for(i = 0; i < 4; i++ ) {
 135       if (mask & (1<<i)) {
 136          struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
 137          brw_MOV(p, dst, suboffset(interp[i],3));
 138       }
 139    }
 140 }
 141
 142 static void emit_linterp(struct brw_wm_compile *c,
 143                          int idx,
 144                          int mask )
 145 {
 146    struct brw_compile *p = &c->func;
 147    struct brw_reg interp[4];
 148    struct brw_reg coef = c->payload_coef[idx];
 149    int i;
 150
 151    emit_delta_xy(c);
 152
 153    interp[0] = brw_vec1_grf(coef.nr, 0);
 154    interp[1] = brw_vec1_grf(coef.nr, 4);
 155    interp[2] = brw_vec1_grf(coef.nr+1, 0);
 156    interp[3] = brw_vec1_grf(coef.nr+1, 4);
 157
 158    for(i = 0; i < 4; i++ ) {
 159       if (mask & (1<<i)) {
 160          struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
 161          brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
 162          brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
 163       }
 164    }
 165 }
 166
 167 #if 0
 168 static void emit_pinterp(struct brw_wm_compile *c,
 169                          int idx,
 170                          int mask )
 171 {
 172    struct brw_compile *p = &c->func;
 173    struct brw_reg interp[4];
 174    struct brw_reg coef = c->payload_coef[idx];
 175    int i;
 176
 177    get_delta_xy(c);
 178    get_pixel_w(c);
 179
 180    interp[0] = brw_vec1_grf(coef.nr, 0);
 181    interp[1] = brw_vec1_grf(coef.nr, 4);
 182    interp[2] = brw_vec1_grf(coef.nr+1, 0);
 183    interp[3] = brw_vec1_grf(coef.nr+1, 4);
 184
 185    for(i = 0; i < 4; i++ ) {
 186       if (mask & (1<<i)) {
 187          struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i);
 188          brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
 189          brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
 190          brw_MUL(p, dst, dst, c->pixel_w);
 191       }
 192    }
 193 }
 194 #endif
 195
 196
 197
 198 #if 0
 199 static void emit_wpos( )
 200 {
 201    struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
 202    struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
 203    struct tgsi_full_src_register deltas = get_delta_xy(c);
 204    struct tgsi_full_src_register arg2;
 205    unsigned opcode;
 206
 207    opcode = WM_LINTERP;
 208    arg2 = src_undef();
 209
 210    /* Have to treat wpos.xy specially:
 211     */
 212    emit_op(c,
 213            WM_WPOSXY,
 214            dst_mask(dst, WRITEMASK_XY),
 215            0, 0, 0,
 216            get_pixel_xy(c),
 217            src_undef(),
 218            src_undef());
 219
 220    dst = dst_mask(dst, WRITEMASK_ZW);
 221
 222    /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
 223     */
 224    emit_op(c,
 225            WM_LINTERP,
 226            dst,
 227            0, 0, 0,
 228            interp,
 229            deltas,
 230            arg2);
 231 }
 232 #endif
 233
 234
 235
 236
 237 /* Perform register allocation:
 238  *
 239  *  -- r0???
 240  *  -- passthrough depth regs (and stencil/aa??)
 241  *  -- curbe ??
 242  *  -- inputs (coefficients)
 243  *
 244  * Use a totally static register allocation.  This will perform poorly
 245  * but is an easy way to get started (again).
 246  */
 247 static void prealloc_reg(struct brw_wm_compile *c)
 248 {
 249    int i, j;
 250    int nr_curbe_regs = 0;
 251
 252    /* R0, then some depth related regs:
 253     */
 254    for (i = 0; i < c->key.nr_depth_regs; i++) {
 255       c->payload_depth[i] =  brw_vec8_grf(i*2, 0);
 256       c->reg_index += 2;
 257    }
 258
 259
 260    /* Then a copy of our part of the CURBE entry:
 261     */
 262    {
 263       int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1;
 264       int index = 0;
 265
 266       /* XXX number of constants, or highest numbered constant? */
 267       assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]);
 268
 269       c->prog_data.max_const = 4*nr_constants;
 270       for (i = 0; i < nr_constants; i++) {
 271          for (j = 0; j < 4; j++, index++)
 272             c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
 273                                                                 index%8);
 274       }
 275
 276       nr_curbe_regs = 2*((4*nr_constants+15)/16);
 277       c->reg_index += nr_curbe_regs;
 278    }
 279
 280    /* Adjust for parameter coefficients for position, which are
 281     * currently always provided.
 282     */
 283 //   c->position_coef[i] = brw_vec8_grf(c->reg_index, 0);
 284    c->reg_index += 2;
 285
 286    /* Next we receive the plane coefficients for parameter
 287     * interpolation:
 288     */
 289    assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs);
 290    for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
 291       c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
 292       c->reg_index += 2;
 293    }
 294
 295    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
 296    c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2;
 297    c->prog_data.curb_read_length = nr_curbe_regs;
 298
 299    /* That's the end of the payload, now we can start allocating registers.
 300     */
 301    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
 302    c->reg_index++;
 303
 304    c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
 305    c->reg_index += 2;
 306
 307    /* Now allocate room for the interpolated inputs and staging
 308     * registers for the outputs:
 309     */
 310    /* XXX do we want to loop over the _number_ of inputs/outputs or loop
 311     * to the highest input/output index that's used?
 312     *  Probably the same, actually.
 313     */
 314    assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
 315    assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs);
 316    for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++)
 317       for (j = 0; j < 4; j++)
 318          c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
 319
 320    for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++)
 321       for (j = 0; j < 4; j++)
 322          c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
 323
 324    /* Beyond this we should only need registers for internal temporaries:
 325     */
 326    c->tmp_start = c->reg_index;
 327 }
 328
 329
 330
 331
 332
 333 /* Need to interpolate fragment program inputs in as a preamble to the
 334  * shader.  A more sophisticated compiler would do this on demand, but
 335  * we'll do it up front:
 336  */
 337 void brw_wm_emit_decls(struct brw_wm_compile *c)
 338 {
 339    struct tgsi_parse_context parse;
 340    int done = 0;
 341
 342    prealloc_reg(c);
 343
 344    tgsi_parse_init( &parse, c->fp->program.tokens );
 345
 346    while( !done &&
 347           !tgsi_parse_end_of_tokens( &parse ) )
 348    {
 349       tgsi_parse_token( &parse );
 350
 351       switch( parse.FullToken.Token.Type ) {
 352       case TGSI_TOKEN_TYPE_DECLARATION:
 353       {
 354          const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
 355          unsigned first = decl->DeclarationRange.First;
 356          unsigned last = decl->DeclarationRange.Last;
 357          unsigned mask = decl->Declaration.UsageMask; /* ? */
 358          unsigned i;
 359
 360          if (decl->Declaration.File != TGSI_FILE_INPUT)
 361             break;
 362
 363          for( i = first; i <= last; i++ ) {
 364             switch (decl->Declaration.Interpolate) {
 365             case TGSI_INTERPOLATE_CONSTANT:
 366                emit_cinterp(c, i, mask);
 367                break;
 368
 369             case TGSI_INTERPOLATE_LINEAR:
 370                emit_linterp(c, i, mask);
 371                break;
 372
 373             case TGSI_INTERPOLATE_PERSPECTIVE:
 374                //emit_pinterp(c, i, mask);
 375                emit_linterp(c, i, mask);
 376                break;
 377             }
 378          }
 379          break;
 380       }
 381       case TGSI_TOKEN_TYPE_IMMEDIATE:
 382       case TGSI_TOKEN_TYPE_INSTRUCTION:
 383       default:
 384          done = 1;
 385          break;
 386       }
 387    }
 388
 389    tgsi_parse_free (&parse);
 390
 391    release_tmps(c);
 392 }