intel/compiler: Do image load/store lowering to NIR
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 #include <pthread.h>
33 #include "main/imports.h"
34 #include "main/glspirv.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "program/prog_to_nir.h"
38 #include "program/program.h"
39 #include "program/programopt.h"
40 #include "tnl/tnl.h"
41 #include "util/ralloc.h"
42 #include "compiler/glsl/ir.h"
43 #include "compiler/glsl/glsl_to_nir.h"
44
45 #include "brw_program.h"
46 #include "brw_context.h"
47 #include "compiler/brw_nir.h"
48 #include "brw_defines.h"
49 #include "intel_batchbuffer.h"
50
51 #include "brw_cs.h"
52 #include "brw_gs.h"
53 #include "brw_vs.h"
54 #include "brw_wm.h"
55
56 static bool
57 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
58 {
59 if (is_scalar) {
60 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
61 type_size_scalar_bytes);
62 return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
63 } else {
64 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
65 type_size_vec4_bytes);
66 return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
67 }
68 }
69
/**
 * Build a NIR shader for \p prog, ready for backend key-specific compilation.
 *
 * The input may come from three front ends: SPIR-V, GLSL IR (both via
 * \p shader_prog), or ARB/fixed-function Mesa IR (when \p shader_prog is
 * NULL).  The result has been through brw_preprocess_nir, image load/store
 * lowering, stage-specific patch-vertices / wpos lowering, and uniform
 * lowering for the chosen (\p is_scalar) backend.
 */
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(shader_prog, stage, options);
      }
      assert (nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_lower_returns(nir);
      nir_validate_shader(nir);
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      /* ARB programs / fixed function: no gl_shader_program exists. */
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir);

   nir = brw_preprocess_nir(brw->screen->compiler, nir);

   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         /* The pass reads the transform from state; make sure the state
          * parameter exists in the program's parameter list.
          */
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}
146
147 void
148 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
149 {
150 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
151
152 /* Copy the info we just generated back into the gl_program */
153 const char *prog_name = prog->info.name;
154 const char *prog_label = prog->info.label;
155 prog->info = nir->info;
156 prog->info.name = prog_name;
157 prog->info.label = prog_label;
158 }
159
/* Hand out a fresh, screen-unique program id (atomic so it is safe across
 * contexts sharing the screen).
 */
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}
165
166 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
167 GLuint id, bool is_arb_asm)
168 {
169 struct brw_context *brw = brw_context(ctx);
170 struct brw_program *prog = rzalloc(NULL, struct brw_program);
171
172 if (prog) {
173 prog->id = get_new_program_id(brw->screen);
174
175 return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
176 }
177
178 return NULL;
179 }
180
/* dd_function_table::DeleteProgram — free a program, poisoning any stale
 * brw->programs[] pointers to it (see the long comment below for why).
 */
static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware! prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program. malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either. If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL. That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}
216
217
/* dd_function_table::ProgramStringNotify — (re)compile an ARB assembly or
 * fixed-function program after its source has been set.  Builds fresh NIR,
 * regathers shader info, and precompiles for the default key.  Only vertex
 * and fragment targets are expected here (see the default case).
 */
static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   /* Position-invariance is only meaningful for vertex programs. */
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      /* If the currently-bound FP was just replaced in place, the pointer
       * comparison in state upload won't notice — flag it explicitly.
       */
      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      /* ARB fragment programs always use the scalar backend. */
      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         /* Inject ftransform()-equivalent MVP code before translation. */
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end). Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}
282
/* dd_function_table::MemoryBarrier — translate GL barrier bits into the
 * PIPE_CONTROL flush/invalidate bits needed on Gen7–Gen11 and emit one
 * pipe control.  Data-cache flush + CS stall are always included.
 */
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   /* The mapping below has only been validated for these generations. */
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}
320
321 static void
322 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
323 {
324 struct brw_context *brw = brw_context(ctx);
325 const struct gen_device_info *devinfo = &brw->screen->devinfo;
326
327 if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
328 if (devinfo->gen >= 6) {
329 brw_emit_pipe_control_flush(brw,
330 PIPE_CONTROL_RENDER_TARGET_FLUSH |
331 PIPE_CONTROL_CS_STALL);
332 brw_emit_pipe_control_flush(brw,
333 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
334 } else {
335 brw_emit_pipe_control_flush(brw,
336 PIPE_CONTROL_RENDER_TARGET_FLUSH);
337 }
338 }
339 }
340
341 void
342 brw_get_scratch_bo(struct brw_context *brw,
343 struct brw_bo **scratch_bo, int size)
344 {
345 struct brw_bo *old_bo = *scratch_bo;
346
347 if (old_bo && old_bo->size < size) {
348 brw_bo_unreference(old_bo);
349 old_bo = NULL;
350 }
351
352 if (!old_bo) {
353 *scratch_bo =
354 brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
355 }
356 }
357
/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes times the maximum number of threads the hardware can run for that
 * stage (derived from the device info below).
 *
 * Existing scratch is kept if it is already large enough per-thread; the old
 * BO is released and a new one allocated otherwise.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch(stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       * "Scratch Space per slice is computed based on 4 sub-slices. SW must
       * allocate scratch space enough so that each slice has 4 slices
       * allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well. This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices
       * and we wish to view that there are 4 subslices per slice
       * instead of the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed. The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is. There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit. Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs. Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
          * as if it had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}
451
/* Install the i965 program-related driver hooks into the dd_function_table.
 * Must run after _tnl_ has installed its defaults (asserted below), since we
 * wrap/override ProgramStringNotify.
 */
void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}
465
/* Accumulated per-entry INTEL_DEBUG=shader_time counters, mirrored from the
 * shader_time BO by brw_collect_shader_time() (three 64-bit values per entry).
 */
struct shader_times {
   uint64_t time;      /* cycles accumulated */
   uint64_t written;   /* count of time writes that landed */
   uint64_t reset;     /* count of writes lost to resets; used to rescale */
};
471
/* Allocate the shader-time BO and the per-entry bookkeeping arrays
 * (names/ids/types/cumulative), all sized for a fixed maximum of 2048
 * entries.  Each BO entry is 3 slots of BRW_SHADER_TIME_STRIDE bytes
 * (time / written / reset — see brw_collect_shader_time).
 */
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}
488
/* qsort comparator for an array of uint64_t* — orders by the pointed-to
 * values, ascending.  Returns strictly -1/0/1 rather than a difference,
 * since a 64-bit difference would not fit the int result.
 */
static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *lhs = a;
   uint64_t * const *rhs = b;

   return (**lhs > **rhs) - (**lhs < **rhs);
}
503
/* Print one row of the shader-time report to stderr: stage tag, shader name,
 * optional numeric id (0 = suppressed, used for totals/fixed-function),
 * raw cycle count, Gcycles, and percentage of \p total.
 * NOTE(review): callers must guarantee total != 0 (brw_report_shader_time
 * checks this before any rows are printed).
 */
static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, " : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}
520
/* Dump the accumulated shader-time data to stderr: one line per entry,
 * sorted by descending-to-ascending cycle count via qsort, followed by
 * per-stage totals.  Entry times are rescaled to compensate for counter
 * resets (see the written/reset handling below).
 */
static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   /* scaled[] holds reset-compensated times; sorted[] holds pointers into
    * scaled[] so the sort can be undone back to entry indices for printing.
    */
   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      /* Scale the recorded time up by the fraction of writes that were
       * lost to resets; guard against dividing by zero writes.
       */
      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type ID cycles spent %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointers times to a time to print. */
      int i = sorted[s] - scaled;

      /* Skip entries that never ran. */
      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}
649
/* Read the GPU-written shader-time counters out of the BO, fold them into
 * brw->shader_time.cumulative[], and zero the BO for the next interval.
 */
static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering. We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      /* Each entry is 3 slots of BRW_SHADER_TIME_STRIDE bytes; the counters
       * are 32-bit values at the start of each slot (hence the /4 indexing
       * on a uint32_t pointer).
       */
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO out to clear it out for our next collection.
    */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}
675
/* Collect the latest shader-time counters and, at most once per second,
 * print the accumulated report.  report_time == 0 means "never reported",
 * forcing an initial report.
 */
void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      /* Timestamp taken after reporting, so printing time doesn't eat
       * into the next interval.
       */
      brw->shader_time.report_time = get_time();
   }
}
687
/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 *
 * \param type      which shader stage/dispatch-width bucket this entry is
 * \param is_glsl_sh true for GLSL shaders (reported by label or "glsl"),
 *                  false for ARB programs ("prog"); Id 0 is fixed function.
 * \return the newly-claimed entry index.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   /* brw_init_shader_time sized all arrays to max_entries. */
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      /* Copy the label so it outlives the program; ralloc'd off names. */
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}
718
/* Release the shader-time BO; the ralloc'd bookkeeping arrays are freed
 * with their brw context parent.
 */
void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}
725
/* Free the ralloc'd param/pull_param arrays hanging off a
 * brw_stage_prog_data.  Takes const void* to fit generic free-callback
 * signatures; the cast drops const deliberately.
 */
void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}
734
735 void
736 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
737 {
738 fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
739 stage, prog->Id, stage);
740 _mesa_print_program(prog);
741 }
742
743 void
744 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
745 struct brw_sampler_prog_key_data *tex,
746 struct gl_program *prog)
747 {
748 const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
749 unsigned sampler_count = util_last_bit(prog->SamplersUsed);
750 for (unsigned i = 0; i < sampler_count; i++) {
751 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
752 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
753 tex->swizzles[i] =
754 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
755 } else {
756 /* Color sampler: assume no swizzling. */
757 tex->swizzles[i] = SWIZZLE_XYZW;
758 }
759 }
760 }
761
/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused but also make sure that addition of small offsets to them will
 * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 *
 * Groups are laid out sequentially: textures, UBOs, SSBOs+ABOs, shader-time,
 * gather textures (pre-Gen8 only), images, pull constants, then the extra
 * YUV plane sections.  Returns the first offset past all assigned groups.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   /* SSBOs and atomic buffers share one contiguous section. */
   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         /* Gen8+ needs no separate gather section; alias the textures. */
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}
842
/* Store \p id into the program_string_id field of whichever per-stage key
 * variant \p key actually is, located via a per-stage offsetof table (the
 * field sits at a different offset in each key struct).
 */
void
brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
                    unsigned id)
{
   static const unsigned stage_offsets[] = {
      offsetof(struct brw_vs_prog_key, program_string_id),
      offsetof(struct brw_tcs_prog_key, program_string_id),
      offsetof(struct brw_tes_prog_key, program_string_id),
      offsetof(struct brw_gs_prog_key, program_string_id),
      offsetof(struct brw_wm_prog_key, program_string_id),
      offsetof(struct brw_cs_prog_key, program_string_id),
   };
   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
   *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
}
858
/* Fill \p prog_key with the default (precompile) key for the program's
 * stage by dispatching to the stage-specific populate helper.  \p sh_prog
 * is only consulted by the tessellation stages.
 */
void
brw_populate_default_key(const struct gen_device_info *devinfo,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}