i965: stop calling nir_lower_returns()
[mesa.git] src/mesa/drivers/dri/i965/brw_program.c
/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include <pthread.h>
#include "main/imports.h"
#include "main/glspirv.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/program.h"
#include "compiler/glsl/gl_nir.h"
#include "compiler/glsl/glsl_to_nir.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

#include "brw_cs.h"
#include "brw_gs.h"
#include "brw_vs.h"
#include "brw_wm.h"

#include "main/shaderapi.h"
#include "main/shaderobj.h"

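/* Assign locations to the uniform variables and lower uniform access to
 * offset-based I/O.  Scalar stages lay uniforms out at 4-byte (scalar)
 * granularity; vec4 stages use 16-byte vec4 slots.
 */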
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm);

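/* Translate a shader stage's GLSL IR, SPIR-V, or ARB assembly into NIR and
 * run the common i965 lowering passes on the result.
 */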
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(ctx, shader_prog, stage, options);
      }
      assert(nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
      NIR_PASS_V(nir, gl_nir_lower_samplers, NULL);
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   nir_shader *softfp64 = NULL;
   if ((options->lower_doubles_options & nir_lower_fp64_full_software) &&
       nir->info.uses_64bit) {
      softfp64 = glsl_float64_funcs_to_nir(ctx, options);
      ralloc_steal(ralloc_parent(nir), softfp64);
   }

   nir = brw_preprocess_nir(brw->screen->compiler, nir, softfp64);

   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a system value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}

void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

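/* Hand out a fresh driver-local program ID.  These end up as the
 * program_string_id in the stage prog keys, so they must be unique per
 * screen, hence the atomic increment.
 */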
static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void
brwDeleteProgram(struct gl_context *ctx, struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware! prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program.  malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program(ctx, prog);
}

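/* Called by core Mesa when an ARB program's source string changes: rebuild
 * the program's NIR and kick off a precompile of the new code.
 */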
static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it: */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end).  Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}

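/* Implement glMemoryBarrier(): translate the GL barrier bits into the
 * PIPE_CONTROL flush/invalidate bits the hardware needs.
 */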
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}

static void
brw_framebuffer_fetch_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}

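/* (Re)allocate *scratch_bo if the existing one is absent or too small.
 * The old contents are not preserved.
 */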
void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo =
         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
   }
}

/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes per thread, for the maximum number of threads the stage's hardware
 * unit can have in flight.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       *    "Scratch Space per slice is computed based on 4 sub-slices. SW must
       *     allocate scratch space enough so that each slice has 4 slices
       *     allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices, and we
       * wish to view that there are 4 subslices per slice instead of the
       * actual number of subslices per slice.
       */
      if (devinfo->gen >= 9 && devinfo->gen < 11)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed.  The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is.  There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit.  Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs.  Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads.  The 6 EU devices appear to calculate thread
          * IDs as if they had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}

void
brwInitFragProgFuncs(struct dd_function_table *functions)
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}

struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};

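/* The shader_time BO holds three counters per entry (time, written, reset),
 * each in its own BRW_SHADER_TIME_STRIDE-sized slot, hence the * 3 below.
 */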
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}

static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We can't just subtract: truncating the 64-bit difference to an int
    * could flip its sign.
    */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}

static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}

static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type  name              ID          cycles spent       %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointer to its index in scaled[], which
       * tells us which time to print.
       */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}

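/* Read back the per-entry counters from the shader_time BO, fold them into
 * the cumulative totals, and zero the BO for the next collection interval.
 */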
static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering.  We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO to clear it for our next collection. */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}

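/* Fold the latest counters into the cumulative totals on every call, but
 * print a report at most once per second.
 */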
void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}

void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}

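/* Set up the sampler swizzle key for a precompile.  Hardware before Haswell
 * lacks shader channel select, so shadow samplers need an explicit swizzle
 * to produce the (X, X, X, 1) result of the default DEPTH_TEXTURE_MODE.
 */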
void
brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused, and also so that adding small offsets to them will trigger the
 * asserts that surface indices are < BRW_MAX_SURFACES.
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* Set the binding table size.  Some callers may append new entries
    * and increase this accordingly.
    */
   stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}

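/* Every per-stage prog key stores its program_string_id at a stage-dependent
 * offset; use offsetof() to patch it without switching over the key types.
 */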
void
brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
                    unsigned id)
{
   static const unsigned stage_offsets[] = {
      offsetof(struct brw_vs_prog_key, program_string_id),
      offsetof(struct brw_tcs_prog_key, program_string_id),
      offsetof(struct brw_tes_prog_key, program_string_id),
      offsetof(struct brw_gs_prog_key, program_string_id),
      offsetof(struct brw_wm_prog_key, program_string_id),
      offsetof(struct brw_cs_prog_key, program_string_id),
   };
   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
   *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
}

void
brw_populate_default_key(const struct gen_device_info *devinfo,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}