src/mesa/drivers/dri/i965/brw_program.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keithw@vmware.com>
  30   */
  31
  32 #include <pthread.h>
  33 #include "main/imports.h"
  34 #include "main/glspirv.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "program/prog_to_nir.h"
  38 #include "program/program.h"
  39 #include "program/programopt.h"
  40 #include "tnl/tnl.h"
  41 #include "util/ralloc.h"
  42 #include "compiler/glsl/ir.h"
  43 #include "compiler/glsl/glsl_to_nir.h"
  44
  45 #include "brw_program.h"
  46 #include "brw_context.h"
  47 #include "compiler/brw_nir.h"
  48 #include "brw_defines.h"
  49 #include "intel_batchbuffer.h"
  50
  51 #include "brw_cs.h"
  52 #include "brw_gs.h"
  53 #include "brw_vs.h"
  54 #include "brw_wm.h"
  55
  56 static bool
  57 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
  58 {
  59    if (is_scalar) {
  60       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  61                                type_size_scalar_bytes);
  62       return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
  63    } else {
  64       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  65                                type_size_vec4_bytes);
  66       return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
  67    }
  68 }
  69
  70 nir_shader *
  71 brw_create_nir(struct brw_context *brw,
  72                const struct gl_shader_program *shader_prog,
  73                struct gl_program *prog,
  74                gl_shader_stage stage,
  75                bool is_scalar)
  76 {
  77    struct gl_context *ctx = &brw->ctx;
  78    const nir_shader_compiler_options *options =
  79       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
  80    nir_shader *nir;
  81
  82    /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
  83    if (shader_prog) {
  84       if (shader_prog->data->spirv) {
  85          nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
  86       } else {
  87          nir = glsl_to_nir(shader_prog, stage, options);
  88       }
  89       assert (nir);
  90
  91       nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
  92       nir_lower_returns(nir);
  93       nir_validate_shader(nir);
  94       NIR_PASS_V(nir, nir_lower_io_to_temporaries,
  95                  nir_shader_get_entrypoint(nir), true, false);
  96    } else {
  97       nir = prog_to_nir(prog, options);
  98       NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
  99    }
 100    nir_validate_shader(nir);
 101
 102    /* Lower PatchVerticesIn from system value to uniform. This needs to
 103     * happen before brw_preprocess_nir, since that will lower system values
 104     * to intrinsics.
 105     *
 106     * We only do this for TES if no TCS is present, since otherwise we know
 107     * the number of vertices in the patch at link time and we can lower it
 108     * directly to a constant. We do this in nir_lower_patch_vertices, which
 109     * needs to run after brw_nir_preprocess has turned the system values
 110     * into intrinsics.
 111     */
 112    const bool lower_patch_vertices_in_to_uniform =
 113       (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) ||
 114       (stage == MESA_SHADER_TESS_EVAL &&
 115        !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
 116
 117    if (lower_patch_vertices_in_to_uniform)
 118       brw_nir_lower_patch_vertices_in_to_uniform(nir);
 119
 120    nir = brw_preprocess_nir(brw->screen->compiler, nir);
 121
 122    if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) {
 123       assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
 124       struct gl_linked_shader *linked_tcs =
 125          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
 126       uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out;
 127       nir_lower_tes_patch_vertices(nir, patch_vertices);
 128    }
 129
 130    if (stage == MESA_SHADER_FRAGMENT) {
 131       static const struct nir_lower_wpos_ytransform_options wpos_options = {
 132          .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
 133          .fs_coord_pixel_center_integer = 1,
 134          .fs_coord_origin_upper_left = 1,
 135       };
 136
 137       bool progress = false;
 138       NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
 139       if (progress) {
 140          _mesa_add_state_reference(prog->Parameters,
 141                                    wpos_options.state_tokens);
 142       }
 143    }
 144
 145    NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
 146
 147    return nir;
 148 }
 149
 150 void
 151 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
 152 {
 153    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 154
 155    /* Copy the info we just generated back into the gl_program */
 156    const char *prog_name = prog->info.name;
 157    const char *prog_label = prog->info.label;
 158    prog->info = nir->info;
 159    prog->info.name = prog_name;
 160    prog->info.label = prog_label;
 161 }
 162
 163 static unsigned
 164 get_new_program_id(struct intel_screen *screen)
 165 {
 166    return p_atomic_inc_return(&screen->program_id);
 167 }
 168
 169 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
 170                                         GLuint id, bool is_arb_asm)
 171 {
 172    struct brw_context *brw = brw_context(ctx);
 173    struct brw_program *prog = rzalloc(NULL, struct brw_program);
 174
 175    if (prog) {
 176       prog->id = get_new_program_id(brw->screen);
 177
 178       return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
 179    }
 180
 181    return NULL;
 182 }
 183
 184 static void brwDeleteProgram( struct gl_context *ctx,
 185                               struct gl_program *prog )
 186 {
 187    struct brw_context *brw = brw_context(ctx);
 188
 189    /* Beware!  prog's refcount has reached zero, and it's about to be freed.
 190     *
 191     * In brw_upload_pipeline_state(), we compare brw->programs[i] to
 192     * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
 193     * pointer has changed.
 194     *
 195     * We cannot leave brw->programs[i] as a dangling pointer to the dead
 196     * program.  malloc() may allocate the same memory for a new gl_program,
 197     * causing us to see matching pointers...but totally different programs.
 198     *
 199     * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
 200     * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
 201     * would cause us to see matching pointers (NULL == NULL), and fail to
 202     * detect that a program has changed since our last draw.
 203     *
 204     * So, set it to a bogus gl_program pointer that will never match,
 205     * causing us to properly reevaluate the state on our next draw.
 206     *
 207     * Getting this wrong causes heisenbugs which are very hard to catch,
 208     * as you need a very specific allocation pattern to hit the problem.
 209     */
 210    static const struct gl_program deleted_program;
 211
 212    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
 213       if (brw->programs[i] == prog)
 214          brw->programs[i] = (struct gl_program *) &deleted_program;
 215    }
 216
 217    _mesa_delete_program( ctx, prog );
 218 }
 219
 220
 221 static GLboolean
 222 brwProgramStringNotify(struct gl_context *ctx,
 223                        GLenum target,
 224                        struct gl_program *prog)
 225 {
 226    assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
 227
 228    struct brw_context *brw = brw_context(ctx);
 229    const struct brw_compiler *compiler = brw->screen->compiler;
 230
 231    switch (target) {
 232    case GL_FRAGMENT_PROGRAM_ARB: {
 233       struct brw_program *newFP = brw_program(prog);
 234       const struct brw_program *curFP =
 235          brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
 236
 237       if (newFP == curFP)
 238          brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
 239       newFP->id = get_new_program_id(brw->screen);
 240
 241       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 242
 243       brw_shader_gather_info(prog->nir, prog);
 244
 245       brw_fs_precompile(ctx, prog);
 246       break;
 247    }
 248    case GL_VERTEX_PROGRAM_ARB: {
 249       struct brw_program *newVP = brw_program(prog);
 250       const struct brw_program *curVP =
 251          brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
 252
 253       if (newVP == curVP)
 254          brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
 255       if (newVP->program.arb.IsPositionInvariant) {
 256          _mesa_insert_mvp_code(ctx, &newVP->program);
 257       }
 258       newVP->id = get_new_program_id(brw->screen);
 259
 260       /* Also tell tnl about it:
 261        */
 262       _tnl_program_string(ctx, target, prog);
 263
 264       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
 265                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 266
 267       brw_shader_gather_info(prog->nir, prog);
 268
 269       brw_vs_precompile(ctx, prog);
 270       break;
 271    }
 272    default:
 273       /*
 274        * driver->ProgramStringNotify is only called for ARB programs, fixed
 275        * function vertex programs, and ir_to_mesa (which isn't used by the
 276        * i965 back-end).  Therefore, even after geometry shaders are added,
 277        * this function should only ever be called with a target of
 278        * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
 279        */
 280       unreachable("Unexpected target in brwProgramStringNotify");
 281    }
 282
 283    return true;
 284 }
 285
 286 static void
 287 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 288 {
 289    struct brw_context *brw = brw_context(ctx);
 290    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 291    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
 292    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 293
 294    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
 295                    GL_ELEMENT_ARRAY_BARRIER_BIT |
 296                    GL_COMMAND_BARRIER_BIT))
 297       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 298
 299    if (barriers & GL_UNIFORM_BARRIER_BIT)
 300       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 301                PIPE_CONTROL_CONST_CACHE_INVALIDATE);
 302
 303    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
 304       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 305
 306    if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
 307                    GL_PIXEL_BUFFER_BARRIER_BIT))
 308       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 309                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 310
 311    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
 312       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 313                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 314
 315    /* Typed surface messages are handled by the render cache on IVB, so we
 316     * need to flush it too.
 317     */
 318    if (devinfo->gen == 7 && !devinfo->is_haswell)
 319       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
 320
 321    brw_emit_pipe_control_flush(brw, bits);
 322 }
 323
 324 static void
 325 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 326 {
 327    struct brw_context *brw = brw_context(ctx);
 328    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 329
 330    if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
 331       if (devinfo->gen >= 6) {
 332          brw_emit_pipe_control_flush(brw,
 333                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
 334                                      PIPE_CONTROL_CS_STALL);
 335          brw_emit_pipe_control_flush(brw,
 336                                      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 337       } else {
 338          brw_emit_pipe_control_flush(brw,
 339                                      PIPE_CONTROL_RENDER_TARGET_FLUSH);
 340       }
 341    }
 342 }
 343
 344 void
 345 brw_get_scratch_bo(struct brw_context *brw,
 346                    struct brw_bo **scratch_bo, int size)
 347 {
 348    struct brw_bo *old_bo = *scratch_bo;
 349
 350    if (old_bo && old_bo->size < size) {
 351       brw_bo_unreference(old_bo);
 352       old_bo = NULL;
 353    }
 354
 355    if (!old_bo) {
 356       *scratch_bo =
 357          brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
 358    }
 359 }
 360
 361 /**
 362  * Reserve enough scratch space for the given stage to hold \p per_thread_size
 363  * bytes times the given \p thread_count.
 364  */
 365 void
 366 brw_alloc_stage_scratch(struct brw_context *brw,
 367                         struct brw_stage_state *stage_state,
 368                         unsigned per_thread_size)
 369 {
 370    if (stage_state->per_thread_scratch >= per_thread_size)
 371       return;
 372
 373    stage_state->per_thread_scratch = per_thread_size;
 374
 375    if (stage_state->scratch_bo)
 376       brw_bo_unreference(stage_state->scratch_bo);
 377
 378    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 379    unsigned thread_count;
 380    switch(stage_state->stage) {
 381    case MESA_SHADER_VERTEX:
 382       thread_count = devinfo->max_vs_threads;
 383       break;
 384    case MESA_SHADER_TESS_CTRL:
 385       thread_count = devinfo->max_tcs_threads;
 386       break;
 387    case MESA_SHADER_TESS_EVAL:
 388       thread_count = devinfo->max_tes_threads;
 389       break;
 390    case MESA_SHADER_GEOMETRY:
 391       thread_count = devinfo->max_gs_threads;
 392       break;
 393    case MESA_SHADER_FRAGMENT:
 394       thread_count = devinfo->max_wm_threads;
 395       break;
 396    case MESA_SHADER_COMPUTE: {
 397       unsigned subslices = MAX2(brw->screen->subslice_total, 1);
 398
 399       /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
 400        *
 401        * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
 402        *  allocate scratch space enough so that each slice has 4 slices
 403        *  allowed."
 404        *
 405        * According to the other driver team, this applies to compute shaders
 406        * as well.  This is not currently documented at all.
 407        *
 408        * brw->screen->subslice_total is the TOTAL number of subslices
 409        * and we wish to view that there are 4 subslices per slice
 410        * instead of the actual number of subslices per slice.
 411        */
 412       if (devinfo->gen >= 9)
 413          subslices = 4 * brw->screen->devinfo.num_slices;
 414
 415       unsigned scratch_ids_per_subslice;
 416       if (devinfo->is_haswell) {
 417          /* WaCSScratchSize:hsw
 418           *
 419           * Haswell's scratch space address calculation appears to be sparse
 420           * rather than tightly packed. The Thread ID has bits indicating
 421           * which subslice, EU within a subslice, and thread within an EU it
 422           * is. There's a maximum of two slices and two subslices, so these
 423           * can be stored with a single bit. Even though there are only 10 EUs
 424           * per subslice, this is stored in 4 bits, so there's an effective
 425           * maximum value of 16 EUs. Similarly, although there are only 7
 426           * threads per EU, this is stored in a 3 bit number, giving an
 427           * effective maximum value of 8 threads per EU.
 428           *
 429           * This means that we need to use 16 * 8 instead of 10 * 7 for the
 430           * number of threads per subslice.
 431           */
 432          scratch_ids_per_subslice = 16 * 8;
 433       } else if (devinfo->is_cherryview) {
 434          /* Cherryview devices have either 6 or 8 EUs per subslice, and each
 435           * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
 436           * as if it had 8 EUs.
 437           */
 438          scratch_ids_per_subslice = 8 * 7;
 439       } else {
 440          scratch_ids_per_subslice = devinfo->max_cs_threads;
 441       }
 442
 443       thread_count = scratch_ids_per_subslice * subslices;
 444       break;
 445    }
 446    default:
 447       unreachable("Unsupported stage!");
 448    }
 449
 450    stage_state->scratch_bo =
 451       brw_bo_alloc(brw->bufmgr, "shader scratch space",
 452                    per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
 453 }
 454
 455 void brwInitFragProgFuncs( struct dd_function_table *functions )
 456 {
 457    assert(functions->ProgramStringNotify == _tnl_program_string);
 458
 459    functions->NewProgram = brwNewProgram;
 460    functions->DeleteProgram = brwDeleteProgram;
 461    functions->ProgramStringNotify = brwProgramStringNotify;
 462
 463    functions->LinkShader = brw_link_shader;
 464
 465    functions->MemoryBarrier = brw_memory_barrier;
 466    functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 467 }
 468
/**
 * Running totals for one shader_time entry.
 *
 * brw_collect_shader_time() adds the three 32-bit values it reads from the
 * shader time BO into these fields; brw_report_shader_time() combines them
 * into the number it prints.
 */
struct shader_times {
   /* Accumulated time value read back from the BO. */
   uint64_t time;
   /* Accumulated "written" counter.  NOTE(review): presumably the number of
    * samples that contributed to `time` -- confirm against the shader-side
    * code that fills the BO.
    */
   uint64_t written;
   /* Accumulated "reset" counter.  NOTE(review): presumably the number of
    * samples discarded because of a reset -- confirm as above.
    */
   uint64_t reset;
};
 474
 475 void
 476 brw_init_shader_time(struct brw_context *brw)
 477 {
 478    const int max_entries = 2048;
 479    brw->shader_time.bo =
 480       brw_bo_alloc(brw->bufmgr, "shader time",
 481                    max_entries * BRW_SHADER_TIME_STRIDE * 3,
 482                    BRW_MEMZONE_OTHER);
 483    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
 484    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
 485    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
 486                                           max_entries);
 487    brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
 488                                                max_entries);
 489    brw->shader_time.max_entries = max_entries;
 490 }
 491
 492 static int
 493 compare_time(const void *a, const void *b)
 494 {
 495    uint64_t * const *a_val = a;
 496    uint64_t * const *b_val = b;
 497
 498    /* We don't just subtract because we're turning the value to an int. */
 499    if (**a_val < **b_val)
 500       return -1;
 501    else if (**a_val == **b_val)
 502       return 0;
 503    else
 504       return 1;
 505 }
 506
 507 static void
 508 print_shader_time_line(const char *stage, const char *name,
 509                        int shader_num, uint64_t time, uint64_t total)
 510 {
 511    fprintf(stderr, "%-6s%-18s", stage, name);
 512
 513    if (shader_num != 0)
 514       fprintf(stderr, "%4d: ", shader_num);
 515    else
 516       fprintf(stderr, "    : ");
 517
 518    fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
 519            (long long)time,
 520            (double)time / 1000000000.0,
 521            (double)time / total * 100.0);
 522 }
 523
 524 static void
 525 brw_report_shader_time(struct brw_context *brw)
 526 {
 527    if (!brw->shader_time.bo || !brw->shader_time.num_entries)
 528       return;
 529
 530    uint64_t scaled[brw->shader_time.num_entries];
 531    uint64_t *sorted[brw->shader_time.num_entries];
 532    uint64_t total_by_type[ST_CS + 1];
 533    memset(total_by_type, 0, sizeof(total_by_type));
 534    double total = 0;
 535    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 536       uint64_t written = 0, reset = 0;
 537       enum shader_time_shader_type type = brw->shader_time.types[i];
 538
 539       sorted[i] = &scaled[i];
 540
 541       switch (type) {
 542       case ST_VS:
 543       case ST_TCS:
 544       case ST_TES:
 545       case ST_GS:
 546       case ST_FS8:
 547       case ST_FS16:
 548       case ST_FS32:
 549       case ST_CS:
 550          written = brw->shader_time.cumulative[i].written;
 551          reset = brw->shader_time.cumulative[i].reset;
 552          break;
 553
 554       default:
 555          /* I sometimes want to print things that aren't the 3 shader times.
 556           * Just print the sum in that case.
 557           */
 558          written = 1;
 559          reset = 0;
 560          break;
 561       }
 562
 563       uint64_t time = brw->shader_time.cumulative[i].time;
 564       if (written) {
 565          scaled[i] = time / written * (written + reset);
 566       } else {
 567          scaled[i] = time;
 568       }
 569
 570       switch (type) {
 571       case ST_VS:
 572       case ST_TCS:
 573       case ST_TES:
 574       case ST_GS:
 575       case ST_FS8:
 576       case ST_FS16:
 577       case ST_FS32:
 578       case ST_CS:
 579          total_by_type[type] += scaled[i];
 580          break;
 581       default:
 582          break;
 583       }
 584
 585       total += scaled[i];
 586    }
 587
 588    if (total == 0) {
 589       fprintf(stderr, "No shader time collected yet\n");
 590       return;
 591    }
 592
 593    qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
 594
 595    fprintf(stderr, "\n");
 596    fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
 597    for (int s = 0; s < brw->shader_time.num_entries; s++) {
 598       const char *stage;
 599       /* Work back from the sorted pointers times to a time to print. */
 600       int i = sorted[s] - scaled;
 601
 602       if (scaled[i] == 0)
 603          continue;
 604
 605       int shader_num = brw->shader_time.ids[i];
 606       const char *shader_name = brw->shader_time.names[i];
 607
 608       switch (brw->shader_time.types[i]) {
 609       case ST_VS:
 610          stage = "vs";
 611          break;
 612       case ST_TCS:
 613          stage = "tcs";
 614          break;
 615       case ST_TES:
 616          stage = "tes";
 617          break;
 618       case ST_GS:
 619          stage = "gs";
 620          break;
 621       case ST_FS8:
 622          stage = "fs8";
 623          break;
 624       case ST_FS16:
 625          stage = "fs16";
 626          break;
 627       case ST_FS32:
 628          stage = "fs32";
 629          break;
 630       case ST_CS:
 631          stage = "cs";
 632          break;
 633       default:
 634          stage = "other";
 635          break;
 636       }
 637
 638       print_shader_time_line(stage, shader_name, shader_num,
 639                              scaled[i], total);
 640    }
 641
 642    fprintf(stderr, "\n");
 643    print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
 644    print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
 645    print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
 646    print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
 647    print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
 648    print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
 649    print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
 650    print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
 651 }
 652
 653 static void
 654 brw_collect_shader_time(struct brw_context *brw)
 655 {
 656    if (!brw->shader_time.bo)
 657       return;
 658
 659    /* This probably stalls on the last rendering.  We could fix that by
 660     * delaying reading the reports, but it doesn't look like it's a big
 661     * overhead compared to the cost of tracking the time in the first place.
 662     */
 663    void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 664
 665    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 666       uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
 667
 668       brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
 669       brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
 670       brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
 671    }
 672
 673    /* Zero the BO out to clear it out for our next collection.
 674     */
 675    memset(bo_map, 0, brw->shader_time.bo->size);
 676    brw_bo_unmap(brw->shader_time.bo);
 677 }
 678
 679 void
 680 brw_collect_and_report_shader_time(struct brw_context *brw)
 681 {
 682    brw_collect_shader_time(brw);
 683
 684    if (brw->shader_time.report_time == 0 ||
 685        get_time() - brw->shader_time.report_time >= 1.0) {
 686       brw_report_shader_time(brw);
 687       brw->shader_time.report_time = get_time();
 688    }
 689 }
 690
 691 /**
 692  * Chooses an index in the shader_time buffer and sets up tracking information
 693  * for our printouts.
 694  *
 695  * Note that this holds on to references to the underlying programs, which may
 696  * change their lifetimes compared to normal operation.
 697  */
 698 int
 699 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
 700                           enum shader_time_shader_type type, bool is_glsl_sh)
 701 {
 702    int shader_time_index = brw->shader_time.num_entries++;
 703    assert(shader_time_index < brw->shader_time.max_entries);
 704    brw->shader_time.types[shader_time_index] = type;
 705
 706    const char *name;
 707    if (prog->Id == 0) {
 708       name = "ff";
 709    } else if (is_glsl_sh) {
 710       name = prog->info.label ?
 711          ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
 712    } else {
 713       name = "prog";
 714    }
 715
 716    brw->shader_time.names[shader_time_index] = name;
 717    brw->shader_time.ids[shader_time_index] = prog->Id;
 718
 719    return shader_time_index;
 720 }
 721
 722 void
 723 brw_destroy_shader_time(struct brw_context *brw)
 724 {
 725    brw_bo_unreference(brw->shader_time.bo);
 726    brw->shader_time.bo = NULL;
 727 }
 728
 729 void
 730 brw_stage_prog_data_free(const void *p)
 731 {
 732    struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
 733
 734    ralloc_free(prog_data->param);
 735    ralloc_free(prog_data->pull_param);
 736 }
 737
 738 void
 739 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 740 {
 741    fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
 742            stage, prog->Id, stage);
 743    _mesa_print_program(prog);
 744 }
 745
 746 void
 747 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
 748                              struct brw_sampler_prog_key_data *tex,
 749                              struct gl_program *prog)
 750 {
 751    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
 752    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
 753    for (unsigned i = 0; i < sampler_count; i++) {
 754       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
 755          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
 756          tex->swizzles[i] =
 757             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
 758       } else {
 759          /* Color sampler: assume no swizzling. */
 760          tex->swizzles[i] = SWIZZLE_XYZW;
 761       }
 762    }
 763 }
 764
 765 /**
 766  * Sets up the starting offsets for the groups of binding table entries
 767  * common to all pipeline stages.
 768  *
 769  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 770  * unused but also make sure that addition of small offsets to them will
 771  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 772  */
 773 uint32_t
 774 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
 775                                         const struct gl_program *prog,
 776                                         struct brw_stage_prog_data *stage_prog_data,
 777                                         uint32_t next_binding_table_offset)
 778 {
 779    int num_textures = util_last_bit(prog->SamplersUsed);
 780
 781    stage_prog_data->binding_table.texture_start = next_binding_table_offset;
 782    next_binding_table_offset += num_textures;
 783
 784    if (prog->info.num_ubos) {
 785       assert(prog->info.num_ubos <= BRW_MAX_UBO);
 786       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
 787       next_binding_table_offset += prog->info.num_ubos;
 788    } else {
 789       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
 790    }
 791
 792    if (prog->info.num_ssbos || prog->info.num_abos) {
 793       assert(prog->info.num_abos <= BRW_MAX_ABO);
 794       assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
 795       stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
 796       next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
 797    } else {
 798       stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
 799    }
 800
 801    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
 802       stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
 803       next_binding_table_offset++;
 804    } else {
 805       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
 806    }
 807
 808    if (prog->info.uses_texture_gather) {
 809       if (devinfo->gen >= 8) {
 810          stage_prog_data->binding_table.gather_texture_start =
 811             stage_prog_data->binding_table.texture_start;
 812       } else {
 813          stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
 814          next_binding_table_offset += num_textures;
 815       }
 816    } else {
 817       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
 818    }
 819
 820    if (prog->info.num_images) {
 821       stage_prog_data->binding_table.image_start = next_binding_table_offset;
 822       next_binding_table_offset += prog->info.num_images;
 823    } else {
 824       stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
 825    }
 826
 827    /* This may or may not be used depending on how the compile goes. */
 828    stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
 829    next_binding_table_offset++;
 830
 831    /* Plane 0 is just the regular texture section */
 832    stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
 833
 834    stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
 835    next_binding_table_offset += num_textures;
 836
 837    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
 838    next_binding_table_offset += num_textures;
 839
 840    /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */
 841
 842    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
 843    return next_binding_table_offset;
 844 }
 845
 846 void
 847 brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
 848                     unsigned id)
 849 {
 850    static const unsigned stage_offsets[] = {
 851       offsetof(struct brw_vs_prog_key, program_string_id),
 852       offsetof(struct brw_tcs_prog_key, program_string_id),
 853       offsetof(struct brw_tes_prog_key, program_string_id),
 854       offsetof(struct brw_gs_prog_key, program_string_id),
 855       offsetof(struct brw_wm_prog_key, program_string_id),
 856       offsetof(struct brw_cs_prog_key, program_string_id),
 857    };
 858    assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
 859    *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
 860 }
 861
 862 void
 863 brw_populate_default_key(const struct gen_device_info *devinfo,
 864                          union brw_any_prog_key *prog_key,
 865                          struct gl_shader_program *sh_prog,
 866                          struct gl_program *prog)
 867 {
 868    switch (prog->info.stage) {
 869    case MESA_SHADER_VERTEX:
 870       brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
 871       break;
 872    case MESA_SHADER_TESS_CTRL:
 873       brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
 874       break;
 875    case MESA_SHADER_TESS_EVAL:
 876       brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
 877       break;
 878    case MESA_SHADER_GEOMETRY:
 879       brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
 880       break;
 881    case MESA_SHADER_FRAGMENT:
 882       brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
 883       break;
 884    case MESA_SHADER_COMPUTE:
 885       brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
 886       break;
 887    default:
 888       unreachable("Unsupported stage!");
 889    }
 890 }