i965: Use ShaderCacheSerializeDriverBlob driver function
[mesa.git] src/mesa/drivers/dri/i965/brw_program.c
/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
 /*
  * Authors:
  *   Keith Whitwell <keithw@vmware.com>
  */

#include <pthread.h>
#include "main/imports.h"
#include "main/glspirv.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir/nir_serialize.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

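/* Assign uniform variable locations and lower uniform loads to offset-based
 * access. The scalar backend lays uniforms out with scalar sizes and
 * alignments, while the vec4 backend pads every uniform to a vec4 slot, so
 * the two paths differ only in the type_size callback handed to
 * nir_lower_io().
 */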
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(shader_prog, stage, options);
      }
      assert(nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_lower_returns(nir);
      nir_validate_shader(nir);
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir);
   /* Lower PatchVerticesIn from system value to uniform. This needs to
    * happen before brw_preprocess_nir, since that will lower system values
    * to intrinsics.
    *
    * We only do this for TES if no TCS is present, since otherwise we know
    * the number of vertices in the patch at link time and we can lower it
    * directly to a constant. We do that in nir_lower_tes_patch_vertices,
    * which needs to run after brw_preprocess_nir has turned the system
    * values into intrinsics.
    */
   const bool lower_patch_vertices_in_to_uniform =
      (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) ||
      (stage == MESA_SHADER_TESS_EVAL &&
       !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);

   if (lower_patch_vertices_in_to_uniform)
      brw_nir_lower_patch_vertices_in_to_uniform(nir);

   nir = brw_preprocess_nir(brw->screen->compiler, nir);

   if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) {
      assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
      struct gl_linked_shader *linked_tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out;
      nir_lower_tes_patch_vertices(nir, patch_vertices);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}

void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

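/* Program IDs are handed out atomically so that contexts sharing a single
 * intel_screen never see a duplicate. A fresh ID is assigned whenever a
 * program's source changes (see brwProgramStringNotify below), which keeps
 * stale compiled variants from matching in the program caches.
 */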
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware! prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program. malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either. If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL. That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}


static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end). Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}

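/* Implements glMemoryBarrier() by translating the GL barrier bits into the
 * PIPE_CONTROL cache flush/invalidate bits the hardware understands. The
 * unconditional data-cache flush plus CS stall covers the writes the
 * shaders have performed; the per-bit invalidates below cover the caches
 * that subsequent reads might hit.
 */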
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}

static void
brw_framebuffer_fetch_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}

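/* Grow-only allocator for a shared scratch BO: the existing BO is reused
 * unless it is too small, in which case it is dropped and a bigger one is
 * allocated from the dedicated scratch memory zone.
 */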
void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo =
         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
   }
}

/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes times the given \p thread_count.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       * "Scratch Space per slice is computed based on 4 sub-slices. SW must
       *  allocate scratch space enough so that each slice has 4 slices
       *  allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well. This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices
       * and we wish to view that there are 4 subslices per slice
       * instead of the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed. The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is. There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit. Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs. Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
          * as if it had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}

void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}

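/* Per-entry accumulators for INTEL_DEBUG=shader_time, matching the three
 * counters each shader writes into its slot of the shader_time BO:
 * accumulated cycles, the number of intervals written, and the number of
 * intervals that were discarded (reset).
 */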
struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};

void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}

static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We don't just subtract because we're turning the value to an int. */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}

static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}

static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

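      /* "written" counts the intervals that actually landed in the buffer
       * and "reset" the ones that were thrown away, so scale the measured
       * time up, costing each discarded interval at the average of the
       * recorded ones.
       */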
      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointers times to a time to print. */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}

static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering. We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO out to clear it out for our next collection.
    */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}

void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}

void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}

void
brw_setup_tex_for_precompile(struct brw_context *brw,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused but also make sure that addition of small offsets to them will
 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 */
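/* The resulting layout, in increasing binding-table-index order, is:
 * textures, UBOs, atomics+SSBOs, the shader_time slot, gather textures
 * (gen7 only; gen8+ aliases the regular textures), images, one pull
 * constant buffer, and two extra per-texture planes for multi-planar
 * (e.g. YUV) sampling; plane 0 aliases the regular texture section.
 */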
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}

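/* Serialize the program's NIR into prog->driver_cache_blob. Per the commit
 * subject, this is presumably the routine installed as the
 * ShaderCacheSerializeDriverBlob driver hook, producing the blob the disk
 * shader cache stores; deserialization below is the inverse, consuming
 * (and then freeing) the cached blob.
 */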
void
brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
{
   if (prog->driver_cache_blob)
      return;

   struct blob writer;
   blob_init(&writer);
   nir_serialize(&writer, prog->nir);
   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
   memcpy(prog->driver_cache_blob, writer.data, writer.size);
   prog->driver_cache_blob_size = writer.size;
   blob_finish(&writer);
}

void
brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
                            gl_shader_stage stage)
{
   if (!prog->nir) {
      assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0);
      const struct nir_shader_compiler_options *options =
         ctx->Const.ShaderCompilerOptions[stage].NirOptions;
      struct blob_reader reader;
      blob_reader_init(&reader, prog->driver_cache_blob,
                       prog->driver_cache_blob_size);
      prog->nir = nir_deserialize(NULL, options, &reader);
   }

   if (prog->driver_cache_blob) {
      ralloc_free(prog->driver_cache_blob);
      prog->driver_cache_blob = NULL;
      prog->driver_cache_blob_size = 0;
   }
}