/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include <pthread.h>
#include "main/imports.h"
#include "main/glspirv.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/glsl_to_nir.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

#include "brw_cs.h"
#include "brw_gs.h"
#include "brw_vs.h"
#include "brw_wm.h"

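/* Assign uniform locations and lower uniform access to offsets: scalar
 * back-ends use the bytes-based scalar layout, vec4 back-ends use vec4
 * slots.
 */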
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

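/* Translate a gl_program into NIR (from GLSL IR or SPIR-V for linked
 * shader programs, or from Mesa IR for ARB programs and fixed function)
 * and run the i965-specific lowering passes on the result.
 */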
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(shader_prog, stage, options);
      }
      assert(nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_lower_returns(nir);
      nir_validate_shader(nir);
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir);

   nir = brw_preprocess_nir(brw->screen->compiler, nir);

   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a system value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
   NIR_PASS_V(nir, brw_nir_lower_glsl_images, prog);

   return nir;
}

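/* Gather the NIR shader info and mirror it into the gl_program, keeping
 * the program's original name and label.
 */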
void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void brwDeleteProgram(struct gl_context *ctx,
                             struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware! prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program. malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either. If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL. That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program(ctx, prog);
}


static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end). Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}

static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}

static void
brw_framebuffer_fetch_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}

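/* Return (in *scratch_bo) a scratch BO of at least \p size bytes,
 * replacing the existing one if it is too small.
 */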
void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo =
         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
   }
}

/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes for every thread the stage can run concurrently.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       *    "Scratch Space per slice is computed based on 4 sub-slices. SW must
       *     allocate scratch space enough so that each slice has 4 slices
       *     allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well. This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices, but
       * here we must act as if every slice had 4 subslices, regardless of
       * the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed. The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is. There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit. Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs. Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads. The 6 EU devices appear to calculate thread
          * IDs as if they had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}

void brwInitFragProgFuncs(struct dd_function_table *functions)
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}

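/* Per-entry accumulators matching the three BRW_SHADER_TIME_STRIDE-sized
 * slots of each record in the shader_time BO; see brw_collect_shader_time().
 */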
struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};

void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}

static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We can't just subtract the values: the 64-bit difference would be
    * truncated when converted to the int return value.
    */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}

static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}

static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* We sometimes print entries that aren't one of the shader stage
          * types. Just report the raw sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type          ID      cycles spent                   %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointer to its index in scaled[]. */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}

static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering. We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO to reset it for our next collection. */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}

void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this records the program's id and a copy of its label rather
 * than holding a reference to the gl_program itself.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}

void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}

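/* Guess the sampler state for a precompile: without shader channel select
 * (pre-Haswell), shadow samplers get the legacy DEPTH_TEXTURE_MODE
 * (X, X, X, 1) swizzle; everything else is assumed unswizzled.
 */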
void
brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused, and also so that adding small offsets to them will trigger our
 * asserts that surface indices are < BRW_MAX_SURFACES.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}

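/* Each stage's prog key stores program_string_id at a different offset, so
 * an offsetof() table lets us set it generically instead of switching on
 * the stage.
 */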
void
brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
                    unsigned id)
{
   static const unsigned stage_offsets[] = {
      offsetof(struct brw_vs_prog_key, program_string_id),
      offsetof(struct brw_tcs_prog_key, program_string_id),
      offsetof(struct brw_tes_prog_key, program_string_id),
      offsetof(struct brw_gs_prog_key, program_string_id),
      offsetof(struct brw_wm_prog_key, program_string_id),
      offsetof(struct brw_cs_prog_key, program_string_id),
   };
   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
   *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
}

void
brw_populate_default_key(const struct gen_device_info *devinfo,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}