intel: Use a system value for gl_FragCoord
src/mesa/drivers/dri/i965/brw_program.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include <pthread.h>
#include "main/imports.h"
#include "main/glspirv.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/program.h"
#include "compiler/glsl/gl_nir.h"
#include "compiler/glsl/glsl_to_nir.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

#include "brw_cs.h"
#include "brw_gs.h"
#include "brw_vs.h"
#include "brw_wm.h"
#include "brw_state.h"

#include "main/shaderapi.h"
#include "main/shaderobj.h"

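/* Assign uniform locations and lower uniform access to offsets.  The
 * type_size callbacks encode the backend's uniform layout: scalar stages
 * pack components tightly at 4 bytes each, while vec4 stages pad every
 * uniform out to 16-byte vec4 slots.
 */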
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm);

nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(ctx, shader_prog, stage, options);
      }
      assert(nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (!ctx->SoftFP64 && nir->info.uses_64bit &&
       (options->lower_doubles_options & nir_lower_fp64_full_software)) {
      ctx->SoftFP64 = glsl_float64_funcs_to_nir(ctx, options);
   }

   brw_preprocess_nir(brw->screen->compiler, nir, ctx->SoftFP64);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a system value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or to a
       * uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}

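/* Lower GL resource access -- samplers, images, and UBO/SSBO blocks -- from
 * variable dereferences to the flat binding-table indices the i965 backend
 * uses.  Note that this operates on prog->nir, which callers are expected
 * to have set up already.
 */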
void
brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
                        struct gl_program *prog,
                        const struct gen_device_info *devinfo)
{
   NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
   prog->info.textures_used = prog->nir->info.textures_used;
   prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;

   NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo);

   NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
   /* Do a round of constant folding to clean up address calculations */
   NIR_PASS_V(prog->nir, nir_opt_constant_folding);
}

void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

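/* Program ids are allocated per-screen from an atomic counter.  They feed
 * program_string_id in the compile keys, giving each program a stable
 * identity in the program cache independent of its memory address.
 */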
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware!  prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program.  malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}


static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      _mesa_program_fragment_position_to_sysval(&newFP->program);
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end).  Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}

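/* Implement glMemoryBarrier(): translate the GL barrier bits into
 * PIPE_CONTROL flush/invalidate bits so that prior shader writes are
 * visible to the requested consumers.  The data cache flush and CS stall
 * are emitted unconditionally, since shader writes land in the data cache.
 */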
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}

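/* Barrier for non-coherent framebuffer fetch: when the coherent
 * EXT_shader_framebuffer_fetch path isn't available, flush the render
 * target and (on Gen6+) invalidate the texture cache, presumably because
 * the fetch goes through the sampler path.
 */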
static void
brw_framebuffer_fetch_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}

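/* Return (in *scratch_bo) a scratch buffer of at least \p size bytes,
 * reallocating the existing BO if it is too small.  The old contents are
 * discarded; scratch data does not need to survive reallocation.
 */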
void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo =
         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
   }
}

/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes per thread, for the maximum number of threads the stage can have in
 * flight on this hardware.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
       *     must allocate scratch space enough so that each slice has 4
       *     slices allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices, and
       * we want to treat each slice as if it had 4 subslices, regardless of
       * the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9 && devinfo->gen < 11)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed.  The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is.  There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit.  Even though there are only 10
          * EUs per subslice, this is stored in 4 bits, so there's an
          * effective maximum value of 16 EUs.  Similarly, although there are
          * only 7 threads per EU, this is stored in a 3 bit number, giving
          * an effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads.  The 6 EU devices appear to calculate thread
          * IDs as if they had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}

void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}

struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};

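/* The shader_time BO holds three uint32 counters per entry -- accumulated
 * time, number of samples written, and number of resets -- spaced
 * BRW_SHADER_TIME_STRIDE bytes apart, presumably so each counter gets its
 * own cacheline for the shaders' atomic adds.  brw_collect_shader_time()
 * folds them into the cumulative array allocated here.
 */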
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}

static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We don't just subtract because we're turning the value into an int. */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}

static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}

static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

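      /* "written" counts samples actually accumulated into "time", while
       * "reset" counts samples that were discarded (e.g. when the counter
       * got reset mid-run).  Scale the total up proportionally, on the
       * assumption that the discarded samples look like the recorded ones.
       */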
595 if (written) {
596 scaled[i] = time / written * (written + reset);
597 } else {
598 scaled[i] = time;
599 }
600
601 switch (type) {
602 case ST_VS:
603 case ST_TCS:
604 case ST_TES:
605 case ST_GS:
606 case ST_FS8:
607 case ST_FS16:
608 case ST_FS32:
609 case ST_CS:
610 total_by_type[type] += scaled[i];
611 break;
612 default:
613 break;
614 }
615
616 total += scaled[i];
617 }
618
619 if (total == 0) {
620 fprintf(stderr, "No shader time collected yet\n");
621 return;
622 }
623
624 qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
625
626 fprintf(stderr, "\n");
627 fprintf(stderr, "type ID cycles spent %% of total\n");
628 for (int s = 0; s < brw->shader_time.num_entries; s++) {
629 const char *stage;
630 /* Work back from the sorted pointers times to a time to print. */
631 int i = sorted[s] - scaled;
632
633 if (scaled[i] == 0)
634 continue;
635
636 int shader_num = brw->shader_time.ids[i];
637 const char *shader_name = brw->shader_time.names[i];
638
639 switch (brw->shader_time.types[i]) {
640 case ST_VS:
641 stage = "vs";
642 break;
643 case ST_TCS:
644 stage = "tcs";
645 break;
646 case ST_TES:
647 stage = "tes";
648 break;
649 case ST_GS:
650 stage = "gs";
651 break;
652 case ST_FS8:
653 stage = "fs8";
654 break;
655 case ST_FS16:
656 stage = "fs16";
657 break;
658 case ST_FS32:
659 stage = "fs32";
660 break;
661 case ST_CS:
662 stage = "cs";
663 break;
664 default:
665 stage = "other";
666 break;
667 }
668
669 print_shader_time_line(stage, shader_name, shader_num,
670 scaled[i], total);
671 }
672
673 fprintf(stderr, "\n");
674 print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
675 print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
676 print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
677 print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
678 print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
679 print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
680 print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
681 print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
682 }
683
684 static void
685 brw_collect_shader_time(struct brw_context *brw)
686 {
687 if (!brw->shader_time.bo)
688 return;
689
690 /* This probably stalls on the last rendering. We could fix that by
691 * delaying reading the reports, but it doesn't look like it's a big
692 * overhead compared to the cost of tracking the time in the first place.
693 */
694 void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
695
696 for (int i = 0; i < brw->shader_time.num_entries; i++) {
697 uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
698
699 brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
700 brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
701 brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
702 }
703
704 /* Zero the BO out to clear it out for our next collection.
705 */
706 memset(bo_map, 0, brw->shader_time.bo->size);
707 brw_bo_unmap(brw->shader_time.bo);
708 }
709
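/* Accumulate the latest shader_time counters and, at most once per second,
 * dump the report to stderr.
 */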
void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}

void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}

void
brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                             struct brw_sampler_prog_key_data *tex,
                             const struct gl_program *prog)
{
   const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused but also make sure that addition of small offsets to them will
 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

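   /* Planes 1 and 2 mirror the main texture section with one extra slot per
    * texture, for sampling multi-planar (e.g. YUV) formats where each plane
    * is bound as a separate surface.
    */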
   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* Set the binding table size.  Some callers may append new entries
    * and increase this accordingly.
    */
   stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}

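/* Fill out a stage-specific key with default guesses at the draw-time
 * state.  This is what the link-time precompile uses; if the guesses turn
 * out wrong, the draw-time compile recompiles with the real key and
 * brw_debug_recompile() below can report what changed.
 */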
void
brw_populate_default_key(const struct brw_compiler *compiler,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}

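/* Log a performance warning when a shader is recompiled with a new key, and
 * diff that key against the previous compile of the same program so the
 * state change responsible can be identified.
 */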
void
brw_debug_recompile(struct brw_context *brw,
                    gl_shader_stage stage,
                    unsigned api_id,
                    struct brw_base_prog_key *key)
{
   const struct brw_compiler *compiler = brw->screen->compiler;
   enum brw_cache_id cache_id = brw_stage_cache_id(stage);

   compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
                             _mesa_shader_stage_to_string(stage), api_id);

   const void *old_key =
      brw_find_previous_compile(&brw->cache, cache_id, key->program_string_id);

   brw_debug_key_recompile(compiler, brw, stage, old_key, key);
}