i965/program_cache: Cast the key to char * before adding key_size
mesa.git: src/mesa/drivers/dri/i965/brw_program.c
/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include <pthread.h>
#include "main/imports.h"
#include "main/glspirv.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/program.h"
#include "compiler/glsl/gl_nir.h"
#include "compiler/glsl/glsl_to_nir.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

#include "brw_cs.h"
#include "brw_gs.h"
#include "brw_vs.h"
#include "brw_wm.h"
#include "brw_state.h"

#include "main/shaderapi.h"
#include "main/shaderobj.h"
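
/* Assign locations to uniform variables and lower uniform access to
 * offset-based IO.  The scalar back-end packs uniforms with scalar
 * granularity; the vec4 back-end keeps them in vec4-sized slots.
 */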
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm);
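
/* Build a NIR shader for a Mesa program and run the common i965 lowering
 * on it.  GLSL and SPIR-V programs arrive through the linked
 * gl_shader_program; ARB assembly and fixed-function programs arrive with
 * shader_prog == NULL and are translated with prog_to_nir() instead.
 */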
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(ctx, shader_prog, stage, options);
      }
      assert(nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   nir_shader *softfp64 = NULL;
   if ((options->lower_doubles_options & nir_lower_fp64_full_software) &&
       nir->info.uses_64bit) {
      softfp64 = glsl_float64_funcs_to_nir(ctx, options);
      ralloc_steal(ralloc_parent(nir), softfp64);
   }

   brw_preprocess_nir(brw->screen->compiler, nir, softfp64);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a system value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or to a
       * uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}
void
brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
                        struct gl_program *prog,
                        const struct gen_device_info *devinfo)
{
   NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
   prog->info.textures_used = prog->nir->info.textures_used;
   prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;

   NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo);

   NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
   /* Do a round of constant folding to clean up address calculations */
   NIR_PASS_V(prog->nir, nir_opt_constant_folding);
}

void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware! prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program.  malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}


static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end).  Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}
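
/* Implement glMemoryBarrier(): translate the GL barrier bits into the
 * PIPE_CONTROL flushes and invalidations that make prior shader writes
 * visible to the corresponding fixed-function consumers.
 */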
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}
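
/* Without coherent framebuffer fetch, flush the render cache and
 * invalidate the texture cache so that prior render-target writes are
 * visible to subsequent framebuffer reads.
 */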
static void
brw_framebuffer_fetch_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}
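
/* Return a scratch BO of at least \p size bytes, reallocating only when
 * the existing BO is too small; the BO never shrinks.
 */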
void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo =
         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
   }
}
/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes times the number of threads the hardware can run for that stage.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       *    "Scratch Space per slice is computed based on 4 sub-slices. SW must
       *     allocate scratch space enough so that each slice has 4 slices
       *     allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices, but we
       * want to size scratch as if there were 4 subslices per slice,
       * regardless of the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9 && devinfo->gen < 11)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed.  The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is.  There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit.  Even though there are only 10
          * EUs per subslice, this is stored in 4 bits, so there's an
          * effective maximum value of 16 EUs.  Similarly, although there are
          * only 7 threads per EU, this is stored in a 3 bit number, giving
          * an effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads.  The 6 EU devices appear to calculate thread
          * IDs as if they had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}
void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}

struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};
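
/* INTEL_DEBUG=shader_time bookkeeping: each shader gets one entry of three
 * BRW_SHADER_TIME_STRIDE-sized slots (time, written, reset) in the
 * shader_time BO, plus name/id/type arrays used for the report.
 */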
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}
static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We can't just subtract the values, since the result is truncated to
    * an int.
    */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}
static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}
static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type ID cycles spent %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointer to an index into scaled[],
       * which gives us the time to print.
       */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}
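
/* Accumulate the (time, written, reset) triples from the shader_time BO
 * into the cumulative counters, then zero the BO for the next interval.
 */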
static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering.  We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO to clear it for our next collection. */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}
void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}
void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}
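
/* Choose default sampler swizzles for precompiles.  Hardware without
 * shader channel select (Ivybridge and older) needs an explicit shadow
 * sampler swizzle to implement the default DEPTH_TEXTURE_MODE; everything
 * else gets the identity swizzle.
 */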
void
brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const bool has_shader_channel_select =
      devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}
/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused, and also to make sure that adding small offsets to them will
 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* Set the binding table size.  Some callers may append new entries
    * and increase this accordingly.
    */
   stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}
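
/* Fill out the stage-specific default program key, as used for
 * precompiling before the actual draw-time state is known.
 */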
void
brw_populate_default_key(const struct brw_compiler *compiler,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}
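
/* Log a recompile for INTEL_DEBUG=perf and diff the new program key
 * against the key from the previous compile of the same program so the
 * changed fields can be reported.
 */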
void
brw_debug_recompile(struct brw_context *brw,
                    gl_shader_stage stage,
                    unsigned api_id,
                    unsigned key_program_string_id,
                    void *key)
{
   const struct brw_compiler *compiler = brw->screen->compiler;
   enum brw_cache_id cache_id = brw_stage_cache_id(stage);

   compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
                             _mesa_shader_stage_to_string(stage), api_id);

   const void *old_key =
      brw_find_previous_compile(&brw->cache, cache_id, key_program_string_id);

   brw_debug_key_recompile(compiler, brw, stage, old_key, key);
}