Merge remote-tracking branch 'mareko/r300g-draw-instanced' into pipe-video
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32 #include "brw_context.h"
33 #include "brw_wm.h"
34 #include "brw_state.h"
35 #include "main/formats.h"
36 #include "main/samplerobj.h"
37
38 #include "../glsl/ralloc.h"
39
40 /** Return number of src args for given instruction */
41 GLuint brw_wm_nr_args( GLuint opcode )
42 {
43 switch (opcode) {
44 case WM_FRONTFACING:
45 case WM_PIXELXY:
46 return 0;
47 case WM_CINTERP:
48 case WM_WPOSXY:
49 case WM_DELTAXY:
50 return 1;
51 case WM_LINTERP:
52 case WM_PIXELW:
53 return 2;
54 case WM_FB_WRITE:
55 case WM_PINTERP:
56 return 3;
57 default:
58 assert(opcode < MAX_OPCODE);
59 return _mesa_num_inst_src_regs(opcode);
60 }
61 }
62
63
64 GLuint brw_wm_is_scalar_result( GLuint opcode )
65 {
66 switch (opcode) {
67 case OPCODE_COS:
68 case OPCODE_EX2:
69 case OPCODE_LG2:
70 case OPCODE_POW:
71 case OPCODE_RCP:
72 case OPCODE_RSQ:
73 case OPCODE_SIN:
74 case OPCODE_DP2:
75 case OPCODE_DP3:
76 case OPCODE_DP4:
77 case OPCODE_DPH:
78 case OPCODE_DST:
79 return 1;
80
81 default:
82 return 0;
83 }
84 }
85
86
/**
 * Do GPU code generation for non-GLSL shader.  non-GLSL shaders have
 * no flow control instructions so we can more readily do SSA-style
 * optimizations.
 *
 * The passes below form a fixed pipeline and must run in this exact
 * order: each one consumes the data structures the previous one built
 * up inside @c.
 */
static void
brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
   /* Augment fragment program.  Add instructions for pre- and
    * post-fragment-program tasks such as interpolation and fogging.
    */
   brw_wm_pass_fp(c);

   /* Translate to intermediate representation.  Build register usage
    * chains.
    */
   brw_wm_pass0(c);

   /* Dead code removal.
    */
   brw_wm_pass1(c);

   /* Register allocation.
    * Divide by two because we operate on 16 pixels at a time and require
    * two GRF entries for each logical shader register.
    */
   c->grf_limit = BRW_WM_MAX_GRF / 2;

   brw_wm_pass2(c);

   /* how many general-purpose registers are used (filled in by pass2) */
   c->prog_data.total_grf = c->max_wm_grf;

   /* Emit GEN4 code.
    */
   brw_wm_emit(c);
}
124
/**
 * Compute the layout of the fixed payload registers delivered to a WM
 * thread and record it in @c.
 *
 * On gen6+ this fills in c->nr_payload_regs and the locations of the
 * interpolated source depth/W registers; on older hardware it defers
 * to the interpolation/Z lookup table instead.  The register comments
 * below mirror the hardware payload layout, so the order of the
 * increments must not change.
 */
void
brw_wm_payload_setup(struct brw_context *brw,
		     struct brw_wm_compile *c)
{
   struct intel_context *intel = &brw->intel;
   /* "uses depth" here means the program reads gl_FragCoord (WPOS). */
   bool uses_depth = (c->fp->program.Base.InputsRead &
		      (1 << FRAG_ATTRIB_WPOS)) != 0;

   if (intel->gen >= 6) {
      /* R0-1: masks, pixel X/Y coordinates. */
      c->nr_payload_regs = 2;
      /* R2: only for 32-pixel dispatch.*/
      /* R3-4: perspective pixel location barycentric */
      c->nr_payload_regs += 2;
      /* R5-6: perspective pixel location bary for dispatch width != 8 */
      if (c->dispatch_width == 16) {
	 c->nr_payload_regs += 2;
      }
      /* R7-10: perspective centroid barycentric */
      /* R11-14: perspective sample barycentric */
      /* R15-18: linear pixel location barycentric */
      /* R19-22: linear centroid barycentric */
      /* R23-26: linear sample barycentric */

      /* R27: interpolated depth if uses source depth */
      if (uses_depth) {
	 c->source_depth_reg = c->nr_payload_regs;
	 c->nr_payload_regs++;
	 if (c->dispatch_width == 16) {
	    /* R28: interpolated depth if not 8-wide. */
	    c->nr_payload_regs++;
	 }
      }
      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
       *
       * NOTE(review): gated on the same WPOS-read condition as source
       * depth above, so source W is delivered whenever WPOS is read —
       * presumably the WM setup enables SOURCE_W under the same
       * condition; verify against the gen6 WM state emission.
       */
      if (uses_depth) {
	 c->source_w_reg = c->nr_payload_regs;
	 c->nr_payload_regs++;
	 if (c->dispatch_width == 16) {
	    /* R30: interpolated W if not 8-wide. */
	    c->nr_payload_regs++;
	 }
      }
      /* R31: MSAA position offsets. */
      /* R32-: bary for 32-pixel. */
      /* R58-59: interp W for 32-pixel. */

      /* Writing gl_FragDepth implies both flags: the computed depth is
       * what gets sent to the render target.
       */
      if (c->fp->program.Base.OutputsWritten &
	  BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
	 c->source_depth_to_render_target = GL_TRUE;
	 c->computes_depth = GL_TRUE;
      }
   } else {
      /* Pre-gen6: payload/interpolation setup comes from the IZ table. */
      brw_wm_lookup_iz(intel, c);
   }
}
181
/**
 * All Mesa program -> GPU code generation goes through this function.
 * Depending on the instructions used (i.e. flow control instructions)
 * we'll use one of two code generators.
 *
 * Compiles @fp under @key, then uploads the generated program and its
 * prog_data to the program cache (brw->wm.prog_bo / prog_data).
 */
static void do_wm_prog( struct brw_context *brw,
			struct brw_fragment_program *fp,
			struct brw_wm_prog_key *key)
{
   struct intel_context *intel = &brw->intel;
   struct brw_wm_compile *c;
   const GLuint *program;
   GLuint program_size;

   /* The compile_data scratch struct (with its large instruction/value
    * arrays) is allocated once and reused across compiles.
    */
   c = brw->wm.compile_data;
   if (c == NULL) {
      brw->wm.compile_data = rzalloc(NULL, struct brw_wm_compile);
      c = brw->wm.compile_data;
      if (c == NULL) {
	 /* Ouch - big out of memory problem.  Can't continue
	  * without triggering a segfault, no way to signal,
	  * so just return.
	  */
	 return;
      }
      /* The big arrays hang off c so they are freed with it. */
      c->instruction = rzalloc_array(c, struct brw_wm_instruction, BRW_WM_MAX_INSN);
      c->prog_instructions = rzalloc_array(c, struct prog_instruction, BRW_WM_MAX_INSN);
      c->vreg = rzalloc_array(c, struct brw_wm_value, BRW_WM_MAX_VREG);
      c->refs = rzalloc_array(c, struct brw_wm_ref, BRW_WM_MAX_REF);
   } else {
      /* Reuse: wipe the struct but keep the previously allocated
       * side arrays, whose pointers are saved and restored around the
       * memset.
       */
      void *instruction = c->instruction;
      void *prog_instructions = c->prog_instructions;
      void *vreg = c->vreg;
      void *refs = c->refs;
      memset(c, 0, sizeof(*brw->wm.compile_data));
      c->instruction = instruction;
      c->prog_instructions = prog_instructions;
      c->vreg = vreg;
      c->refs = refs;
   }
   memcpy(&c->key, key, sizeof(*key));

   c->fp = fp;
   c->env_param = brw->intel.ctx.FragmentProgram.Parameters;

   brw_init_compile(brw, &c->func, c);

   /* Try the new (GLSL-capable) backend first; if it declines, use the
    * old 16-wide non-GLSL path.
    */
   if (!brw_wm_fs_emit(brw, c)) {
      /* Fallback for fixed function and ARB_fp shaders. */
      c->dispatch_width = 16;
      brw_wm_payload_setup(brw, c);
      brw_wm_non_glsl_emit(brw, c);
      c->prog_data.dispatch_width = 16;
   }

   /* Scratch space is used for register spilling */
   if (c->last_scratch) {
      uint32_t total_scratch;

      /* Per-thread scratch space is power-of-two sized. */
      for (c->prog_data.total_scratch = 1024;
	   c->prog_data.total_scratch <= c->last_scratch;
	   c->prog_data.total_scratch *= 2) {
	 /* empty */
      }
      /* Size the buffer for the maximum number of concurrent threads. */
      total_scratch = c->prog_data.total_scratch * brw->wm_max_threads;

      /* Grow (never shrink) the shared scratch BO as needed. */
      if (brw->wm.scratch_bo && total_scratch > brw->wm.scratch_bo->size) {
	 drm_intel_bo_unreference(brw->wm.scratch_bo);
	 brw->wm.scratch_bo = NULL;
      }
      if (brw->wm.scratch_bo == NULL) {
	 brw->wm.scratch_bo = drm_intel_bo_alloc(intel->bufmgr,
						 "wm scratch",
						 total_scratch,
						 4096);
      }
   }
   else {
      c->prog_data.total_scratch = 0;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      fprintf(stderr, "\n");

   /* get the program
    */
   program = brw_get_program(&c->func, &program_size);

   /* Upload keyed by c->key; this also points brw->wm.prog_data at the
    * cached copy of prog_data.
    */
   drm_intel_bo_unreference(brw->wm.prog_bo);
   brw->wm.prog_bo = brw_upload_cache(&brw->cache, BRW_WM_PROG,
				      &c->key, sizeof(c->key),
				      program, program_size,
				      &c->prog_data, sizeof(c->prog_data),
				      &brw->wm.prog_data);
}
278
279
280
/**
 * Gather all GL and driver state that affects fragment program code
 * generation into *key.
 *
 * The key is used to look up previously compiled programs in the
 * program cache, so any state read here must also be listed in the
 * brw_wm_prog dirty bits below; the _NEW_* / BRW_NEW_* comments tag
 * each read with the flag that covers it.
 */
static void brw_wm_populate_key( struct brw_context *brw,
				 struct brw_wm_prog_key *key )
{
   struct gl_context *ctx = &brw->intel.ctx;
   /* BRW_NEW_FRAGMENT_PROGRAM */
   const struct brw_fragment_program *fp =
      (struct brw_fragment_program *)brw->fragment_program;
   GLuint lookup = 0;
   GLuint line_aa;
   GLuint i;

   memset(key, 0, sizeof(*key));

   /* Build the index for table lookup
    */
   /* _NEW_COLOR */
   key->alpha_test = ctx->Color.AlphaEnabled;
   if (fp->program.UsesKill ||
       ctx->Color.AlphaEnabled)
      lookup |= IZ_PS_KILL_ALPHATEST_BIT;

   if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

   /* _NEW_DEPTH */
   if (ctx->Depth.Test)
      lookup |= IZ_DEPTH_TEST_ENABLE_BIT;

   if (ctx->Depth.Test &&
       ctx->Depth.Mask) /* ?? */
      lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;

   /* _NEW_STENCIL */
   if (ctx->Stencil._Enabled) {
      lookup |= IZ_STENCIL_TEST_ENABLE_BIT;

      if (ctx->Stencil.WriteMask[0] ||
	  ctx->Stencil.WriteMask[ctx->Stencil._BackFace])
	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
   }

   line_aa = AA_NEVER;

   /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE
    *
    * Decide whether antialiased-line coverage is never, always, or only
    * sometimes (per-primitive) applied, based on what primitive types
    * can actually reach the rasterizer as lines.
    */
   if (ctx->Line.SmoothFlag) {
      if (brw->intel.reduced_primitive == GL_LINES) {
	 line_aa = AA_ALWAYS;
      }
      else if (brw->intel.reduced_primitive == GL_TRIANGLES) {
	 if (ctx->Polygon.FrontMode == GL_LINE) {
	    line_aa = AA_SOMETIMES;

	    if (ctx->Polygon.BackMode == GL_LINE ||
		(ctx->Polygon.CullFlag &&
		 ctx->Polygon.CullFaceMode == GL_BACK))
	       line_aa = AA_ALWAYS;
	 }
	 else if (ctx->Polygon.BackMode == GL_LINE) {
	    line_aa = AA_SOMETIMES;

	    if ((ctx->Polygon.CullFlag &&
		 ctx->Polygon.CullFaceMode == GL_FRONT))
	       line_aa = AA_ALWAYS;
	 }
      }
   }

   key->iz_lookup = lookup;
   key->line_aa = line_aa;
   key->stats_wm = brw->intel.stats_wm;

   /* BRW_NEW_WM_INPUT_DIMENSIONS */
   key->proj_attrib_mask = brw->wm.input_size_masks[4-1];

   /* _NEW_LIGHT */
   key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);

   /* _NEW_HINT */
   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);

   /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
   key->clamp_fragment_color = ctx->Color._ClampFragmentColor;

   /* _NEW_TEXTURE */
   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
      const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];

      if (unit->_ReallyEnabled) {
         const struct gl_texture_object *t = unit->_Current;
         const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
	 struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, i);
	 /* Identity swizzle table, indexed by SWIZZLE_* component. */
	 int swizzles[SWIZZLE_NIL + 1] = {
	    SWIZZLE_X,
	    SWIZZLE_Y,
	    SWIZZLE_Z,
	    SWIZZLE_W,
	    SWIZZLE_ZERO,
	    SWIZZLE_ONE,
	    SWIZZLE_NIL
	 };

	 /* GL_DEPTH_TEXTURE_MODE is normally handled through
	  * brw_wm_surface_state, but it applies to shadow compares as
	  * well and our shadow compares always return the result in
	  * all 4 channels.
	  */
	 if (sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB) {
	    if (sampler->DepthMode == GL_ALPHA) {
	       swizzles[0] = SWIZZLE_ZERO;
	       swizzles[1] = SWIZZLE_ZERO;
	       swizzles[2] = SWIZZLE_ZERO;
	    } else if (sampler->DepthMode == GL_LUMINANCE) {
	       swizzles[3] = SWIZZLE_ONE;
	    } else if (sampler->DepthMode == GL_RED) {
	       /* See table 3.23 of the GL 3.0 spec. */
	       swizzles[1] = SWIZZLE_ZERO;
	       swizzles[2] = SWIZZLE_ZERO;
	       swizzles[3] = SWIZZLE_ONE;
	    }
	 }

	 /* YCbCr textures need a shader-side colorspace conversion;
	  * record which units need it (and which need byte swapping).
	  */
	 if (img->InternalFormat == GL_YCBCR_MESA) {
	    key->yuvtex_mask |= 1 << i;
	    if (img->TexFormat == MESA_FORMAT_YCBCR)
		key->yuvtex_swap_mask |= 1 << i;
	 }

	 /* Compose the texture object's swizzle through the depth-mode
	  * table built above.
	  */
	 key->tex_swizzles[i] =
	    MAKE_SWIZZLE4(swizzles[GET_SWZ(t->_Swizzle, 0)],
			  swizzles[GET_SWZ(t->_Swizzle, 1)],
			  swizzles[GET_SWZ(t->_Swizzle, 2)],
			  swizzles[GET_SWZ(t->_Swizzle, 3)]);

	 /* GL_CLAMP with a linear filter needs shader assistance; note
	  * which coordinates of which units are affected.
	  */
	 if (sampler->MinFilter != GL_NEAREST &&
	     sampler->MagFilter != GL_NEAREST) {
	    if (sampler->WrapS == GL_CLAMP)
	       key->gl_clamp_mask[0] |= 1 << i;
	    if (sampler->WrapT == GL_CLAMP)
	       key->gl_clamp_mask[1] |= 1 << i;
	    if (sampler->WrapR == GL_CLAMP)
	       key->gl_clamp_mask[2] |= 1 << i;
	 }
      }
      else {
	 key->tex_swizzles[i] = SWIZZLE_NOOP;
      }
   }

   /* Shadow */
   key->shadowtex_mask = fp->program.Base.ShadowSamplers;

   /* _NEW_BUFFERS */
   /*
    * Include the draw buffer origin and height so that we can calculate
    * fragment position values relative to the bottom left of the drawable,
    * from the incoming screen origin relative position we get as part of our
    * payload.
    *
    * This is only needed for the WM_WPOSXY opcode when the fragment program
    * uses the gl_FragCoord input.
    *
    * We could avoid recompiling by including this as a constant referenced by
    * our program, but if we were to do that it would also be nice to handle
    * getting that constant updated at batchbuffer submit time (when we
    * hold the lock and know where the buffer really is) rather than at emit
    * time when we don't hold the lock and are just guessing.  We could also
    * just avoid using this as key data if the program doesn't use
    * fragment.position.
    *
    * For DRI2 the origin_x/y will always be (0,0) but we still need the
    * drawable height in order to invert the Y axis.
    */
   if (fp->program.Base.InputsRead & FRAG_BIT_WPOS) {
      key->drawable_height = ctx->DrawBuffer->Height;
      key->render_to_fbo = ctx->DrawBuffer->Name != 0;
   }

   /* _NEW_BUFFERS */
   key->nr_color_regions = ctx->DrawBuffer->_NumColorDrawBuffers;

   /* CACHE_NEW_VS_PROG */
   key->vp_outputs_written = brw->vs.prog_data->outputs_written;

   /* The unique fragment program ID */
   key->program_string_id = fp->id;
}
467
468
469 static void brw_prepare_wm_prog(struct brw_context *brw)
470 {
471 struct brw_wm_prog_key key;
472 struct brw_fragment_program *fp = (struct brw_fragment_program *)
473 brw->fragment_program;
474
475 brw_wm_populate_key(brw, &key);
476
477 /* Make an early check for the key.
478 */
479 drm_intel_bo_unreference(brw->wm.prog_bo);
480 brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
481 &key, sizeof(key),
482 &brw->wm.prog_data);
483 if (brw->wm.prog_bo == NULL)
484 do_wm_prog(brw, fp, &key);
485 }
486
487
/**
 * State atom for the WM (fragment) program.
 *
 * The dirty bits must cover every piece of state read by
 * brw_wm_populate_key(); when any of them changes, the prepare hook
 * re-keys and (if needed) recompiles the fragment program.
 */
const struct brw_tracked_state brw_wm_prog = {
   .dirty = {
      .mesa = (_NEW_COLOR |
	       _NEW_DEPTH |
	       _NEW_HINT |
	       _NEW_STENCIL |
	       _NEW_POLYGON |
	       _NEW_LINE |
	       _NEW_LIGHT |
	       _NEW_FRAG_CLAMP |
	       _NEW_BUFFERS |
	       _NEW_TEXTURE),
      .brw = (BRW_NEW_FRAGMENT_PROGRAM |
	      BRW_NEW_WM_INPUT_DIMENSIONS |
	      BRW_NEW_REDUCED_PRIMITIVE),
      .cache = CACHE_NEW_VS_PROG,
   },
   .prepare = brw_prepare_wm_prog
};
507