src/mesa/drivers/dri/i965/brw_program.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keithw@vmware.com>
  30   */
  31
  32 #include <pthread.h>
  33 #include "main/glspirv.h"
  34 #include "program/prog_parameter.h"
  35 #include "program/prog_print.h"
  36 #include "program/prog_to_nir.h"
  37 #include "program/program.h"
  38 #include "program/programopt.h"
  39 #include "tnl/tnl.h"
  40 #include "util/ralloc.h"
  41 #include "compiler/glsl/ir.h"
  42 #include "compiler/glsl/program.h"
  43 #include "compiler/glsl/gl_nir.h"
  44 #include "compiler/glsl/glsl_to_nir.h"
  45
  46 #include "brw_program.h"
  47 #include "brw_context.h"
  48 #include "compiler/brw_nir.h"
  49 #include "brw_defines.h"
  50 #include "intel_batchbuffer.h"
  51
  52 #include "brw_cs.h"
  53 #include "brw_gs.h"
  54 #include "brw_vs.h"
  55 #include "brw_wm.h"
  56 #include "brw_state.h"
  57
  58 #include "main/shaderapi.h"
  59 #include "main/shaderobj.h"
  60
  61 static bool
  62 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
  63 {
  64    if (is_scalar) {
  65       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  66                                type_size_scalar_bytes);
  67       return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
  68    } else {
  69       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  70                                type_size_vec4_bytes);
  71       return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
  72    }
  73 }
  74
  75 static struct gl_program *brwNewProgram(struct gl_context *ctx,
  76                                         gl_shader_stage stage,
  77                                         GLuint id, bool is_arb_asm);
  78
  79 nir_shader *
  80 brw_create_nir(struct brw_context *brw,
  81                const struct gl_shader_program *shader_prog,
  82                struct gl_program *prog,
  83                gl_shader_stage stage,
  84                bool is_scalar)
  85 {
  86    const struct gen_device_info *devinfo = &brw->screen->devinfo;
  87    struct gl_context *ctx = &brw->ctx;
  88    const nir_shader_compiler_options *options =
  89       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
  90    nir_shader *nir;
  91
  92    /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
  93    if (shader_prog) {
  94       if (shader_prog->data->spirv) {
  95          nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
  96       } else {
  97          nir = glsl_to_nir(ctx, shader_prog, stage, options);
  98
  99          /* Remap the locations to slots so those requiring two slots will
 100           * occupy two locations. For instance, if we have in the IR code a
 101           * dvec3 attr0 in location 0 and vec4 attr1 in location 1, in NIR attr0
 102           * will use locations/slots 0 and 1, and attr1 will use location/slot 2
 103           */
 104          if (nir->info.stage == MESA_SHADER_VERTEX)
 105             nir_remap_dual_slot_attributes(nir, &prog->DualSlotInputs);
 106       }
 107       assert (nir);
 108
 109       nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
 110       nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
 111       NIR_PASS_V(nir, nir_lower_io_to_temporaries,
 112                  nir_shader_get_entrypoint(nir), true, false);
 113    } else {
 114       nir = prog_to_nir(prog, options);
 115       NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
 116    }
 117    nir_validate_shader(nir, "before brw_preprocess_nir");
 118
 119    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 120
 121    if (!ctx->SoftFP64 && nir->info.uses_64bit &&
 122        (options->lower_doubles_options & nir_lower_fp64_full_software)) {
 123       ctx->SoftFP64 = glsl_float64_funcs_to_nir(ctx, options);
 124    }
 125
 126    brw_preprocess_nir(brw->screen->compiler, nir, ctx->SoftFP64);
 127
 128    if (stage == MESA_SHADER_TESS_CTRL) {
 129       /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
 130       static const gl_state_index16 tokens[STATE_LENGTH] =
 131          { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
 132       nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
 133    }
 134
 135    if (stage == MESA_SHADER_TESS_EVAL) {
 136       /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
 137        * a uniform if we don't.
 138        */
 139       struct gl_linked_shader *tcs =
 140          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
 141       uint32_t static_patch_vertices =
 142          tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
 143       static const gl_state_index16 tokens[STATE_LENGTH] =
 144          { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
 145       nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
 146    }
 147
 148    if (stage == MESA_SHADER_FRAGMENT) {
 149       static const struct nir_lower_wpos_ytransform_options wpos_options = {
 150          .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
 151          .fs_coord_pixel_center_integer = 1,
 152          .fs_coord_origin_upper_left = 1,
 153       };
 154
 155       bool progress = false;
 156       NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
 157       if (progress) {
 158          _mesa_add_state_reference(prog->Parameters,
 159                                    wpos_options.state_tokens);
 160       }
 161    }
 162
 163    NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
 164
 165    return nir;
 166 }
 167
 168 static void
 169 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
 170 {
 171    assert(glsl_type_is_vector_or_scalar(type));
 172
 173    uint32_t comp_size = glsl_type_is_boolean(type)
 174       ? 4 : glsl_get_bit_size(type) / 8;
 175    unsigned length = glsl_get_vector_elements(type);
 176    *size = comp_size * length,
 177    *align = comp_size * (length == 3 ? 4 : length);
 178 }
 179
 180 void
 181 brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
 182                         struct gl_program *prog,
 183                         const struct gen_device_info *devinfo)
 184 {
 185    NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
 186    prog->info.textures_used = prog->nir->info.textures_used;
 187    prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;
 188
 189    NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo, NULL);
 190
 191    if (prog->nir->info.stage == MESA_SHADER_COMPUTE &&
 192        shader_prog->data->spirv) {
 193       NIR_PASS_V(prog->nir, nir_lower_vars_to_explicit_types,
 194                  nir_var_mem_shared, shared_type_info);
 195       NIR_PASS_V(prog->nir, nir_lower_explicit_io,
 196                  nir_var_mem_shared, nir_address_format_32bit_offset);
 197    }
 198
 199    NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
 200    /* Do a round of constant folding to clean up address calculations */
 201    NIR_PASS_V(prog->nir, nir_opt_constant_folding);
 202 }
 203
 204 void
 205 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
 206 {
 207    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 208
 209    /* Copy the info we just generated back into the gl_program */
 210    const char *prog_name = prog->info.name;
 211    const char *prog_label = prog->info.label;
 212    prog->info = nir->info;
 213    prog->info.name = prog_name;
 214    prog->info.label = prog_label;
 215 }
 216
 217 static unsigned
 218 get_new_program_id(struct intel_screen *screen)
 219 {
 220    return p_atomic_inc_return(&screen->program_id);
 221 }
 222
 223 static struct gl_program *brwNewProgram(struct gl_context *ctx,
 224                                         gl_shader_stage stage,
 225                                         GLuint id, bool is_arb_asm)
 226 {
 227    struct brw_context *brw = brw_context(ctx);
 228    struct brw_program *prog = rzalloc(NULL, struct brw_program);
 229
 230    if (prog) {
 231       prog->id = get_new_program_id(brw->screen);
 232
 233       return _mesa_init_gl_program(&prog->program, stage, id, is_arb_asm);
 234    }
 235
 236    return NULL;
 237 }
 238
 239 static void brwDeleteProgram( struct gl_context *ctx,
 240                               struct gl_program *prog )
 241 {
 242    struct brw_context *brw = brw_context(ctx);
 243
 244    /* Beware!  prog's refcount has reached zero, and it's about to be freed.
 245     *
 246     * In brw_upload_pipeline_state(), we compare brw->programs[i] to
 247     * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
 248     * pointer has changed.
 249     *
 250     * We cannot leave brw->programs[i] as a dangling pointer to the dead
 251     * program.  malloc() may allocate the same memory for a new gl_program,
 252     * causing us to see matching pointers...but totally different programs.
 253     *
 254     * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
 255     * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
 256     * would cause us to see matching pointers (NULL == NULL), and fail to
 257     * detect that a program has changed since our last draw.
 258     *
 259     * So, set it to a bogus gl_program pointer that will never match,
 260     * causing us to properly reevaluate the state on our next draw.
 261     *
 262     * Getting this wrong causes heisenbugs which are very hard to catch,
 263     * as you need a very specific allocation pattern to hit the problem.
 264     */
 265    static const struct gl_program deleted_program;
 266
 267    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
 268       if (brw->programs[i] == prog)
 269          brw->programs[i] = (struct gl_program *) &deleted_program;
 270    }
 271
 272    _mesa_delete_program( ctx, prog );
 273 }
 274
 275
 276 static GLboolean
 277 brwProgramStringNotify(struct gl_context *ctx,
 278                        GLenum target,
 279                        struct gl_program *prog)
 280 {
 281    assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
 282
 283    struct brw_context *brw = brw_context(ctx);
 284    const struct brw_compiler *compiler = brw->screen->compiler;
 285
 286    switch (target) {
 287    case GL_FRAGMENT_PROGRAM_ARB: {
 288       struct brw_program *newFP = brw_program(prog);
 289       const struct brw_program *curFP =
 290          brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
 291
 292       if (newFP == curFP)
 293          brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
 294       _mesa_program_fragment_position_to_sysval(&newFP->program);
 295       newFP->id = get_new_program_id(brw->screen);
 296
 297       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 298
 299       brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);
 300
 301       brw_shader_gather_info(prog->nir, prog);
 302
 303       brw_fs_precompile(ctx, prog);
 304       break;
 305    }
 306    case GL_VERTEX_PROGRAM_ARB: {
 307       struct brw_program *newVP = brw_program(prog);
 308       const struct brw_program *curVP =
 309          brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
 310
 311       if (newVP == curVP)
 312          brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
 313       if (newVP->program.arb.IsPositionInvariant) {
 314          _mesa_insert_mvp_code(ctx, &newVP->program);
 315       }
 316       newVP->id = get_new_program_id(brw->screen);
 317
 318       /* Also tell tnl about it:
 319        */
 320       _tnl_program_string(ctx, target, prog);
 321
 322       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
 323                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 324
 325       brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);
 326
 327       brw_shader_gather_info(prog->nir, prog);
 328
 329       brw_vs_precompile(ctx, prog);
 330       break;
 331    }
 332    default:
 333       /*
 334        * driver->ProgramStringNotify is only called for ARB programs, fixed
 335        * function vertex programs, and ir_to_mesa (which isn't used by the
 336        * i965 back-end).  Therefore, even after geometry shaders are added,
 337        * this function should only ever be called with a target of
 338        * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
 339        */
 340       unreachable("Unexpected target in brwProgramStringNotify");
 341    }
 342
 343    return true;
 344 }
 345
 346 static void
 347 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 348 {
 349    struct brw_context *brw = brw_context(ctx);
 350    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 351    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
 352    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 353
 354    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
 355                    GL_ELEMENT_ARRAY_BARRIER_BIT |
 356                    GL_COMMAND_BARRIER_BIT))
 357       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 358
 359    if (barriers & GL_UNIFORM_BARRIER_BIT)
 360       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 361                PIPE_CONTROL_CONST_CACHE_INVALIDATE);
 362
 363    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
 364       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 365
 366    if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
 367                    GL_PIXEL_BUFFER_BARRIER_BIT))
 368       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 369                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 370
 371    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
 372       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 373                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 374
 375    /* Typed surface messages are handled by the render cache on IVB, so we
 376     * need to flush it too.
 377     */
 378    if (devinfo->gen == 7 && !devinfo->is_haswell)
 379       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
 380
 381    brw_emit_pipe_control_flush(brw, bits);
 382 }
 383
 384 static void
 385 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 386 {
 387    struct brw_context *brw = brw_context(ctx);
 388    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 389
 390    if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
 391       if (devinfo->gen >= 6) {
 392          brw_emit_pipe_control_flush(brw,
 393                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
 394                                      PIPE_CONTROL_CS_STALL);
 395          brw_emit_pipe_control_flush(brw,
 396                                      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 397       } else {
 398          brw_emit_pipe_control_flush(brw,
 399                                      PIPE_CONTROL_RENDER_TARGET_FLUSH);
 400       }
 401    }
 402 }
 403
 404 void
 405 brw_get_scratch_bo(struct brw_context *brw,
 406                    struct brw_bo **scratch_bo, int size)
 407 {
 408    struct brw_bo *old_bo = *scratch_bo;
 409
 410    if (old_bo && old_bo->size < size) {
 411       brw_bo_unreference(old_bo);
 412       old_bo = NULL;
 413    }
 414
 415    if (!old_bo) {
 416       *scratch_bo =
 417          brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
 418    }
 419 }
 420
 421 /**
 422  * Reserve enough scratch space for the given stage to hold \p per_thread_size
 423  * bytes times the given \p thread_count.
 424  */
 425 void
 426 brw_alloc_stage_scratch(struct brw_context *brw,
 427                         struct brw_stage_state *stage_state,
 428                         unsigned per_thread_size)
 429 {
 430    if (stage_state->per_thread_scratch >= per_thread_size)
 431       return;
 432
 433    stage_state->per_thread_scratch = per_thread_size;
 434
 435    if (stage_state->scratch_bo)
 436       brw_bo_unreference(stage_state->scratch_bo);
 437
 438    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 439    unsigned thread_count;
 440    switch(stage_state->stage) {
 441    case MESA_SHADER_VERTEX:
 442       thread_count = devinfo->max_vs_threads;
 443       break;
 444    case MESA_SHADER_TESS_CTRL:
 445       thread_count = devinfo->max_tcs_threads;
 446       break;
 447    case MESA_SHADER_TESS_EVAL:
 448       thread_count = devinfo->max_tes_threads;
 449       break;
 450    case MESA_SHADER_GEOMETRY:
 451       thread_count = devinfo->max_gs_threads;
 452       break;
 453    case MESA_SHADER_FRAGMENT:
 454       thread_count = devinfo->max_wm_threads;
 455       break;
 456    case MESA_SHADER_COMPUTE: {
 457       unsigned subslices = MAX2(brw->screen->subslice_total, 1);
 458
 459       /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
 460        *
 461        * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
 462        *  allocate scratch space enough so that each slice has 4 slices
 463        *  allowed."
 464        *
 465        * According to the other driver team, this applies to compute shaders
 466        * as well.  This is not currently documented at all.
 467        *
 468        * brw->screen->subslice_total is the TOTAL number of subslices
 469        * and we wish to view that there are 4 subslices per slice
 470        * instead of the actual number of subslices per slice.
 471        *
 472        * For, ICL, scratch space allocation is based on the number of threads
 473        * in the base configuration.
 474        */
 475       if (devinfo->gen == 11)
 476          subslices = 8;
 477       else if (devinfo->gen >= 9 && devinfo->gen < 11)
 478          subslices = 4 * brw->screen->devinfo.num_slices;
 479
 480       unsigned scratch_ids_per_subslice;
 481       if (devinfo->gen >= 11) {
 482          /* The MEDIA_VFE_STATE docs say:
 483           *
 484           *    "Starting with this configuration, the Maximum Number of
 485           *     Threads must be set to (#EU * 8) for GPGPU dispatches.
 486           *
 487           *     Although there are only 7 threads per EU in the configuration,
 488           *     the FFTID is calculated as if there are 8 threads per EU,
 489           *     which in turn requires a larger amount of Scratch Space to be
 490           *     allocated by the driver."
 491           */
 492          scratch_ids_per_subslice = 8 * 8;
 493       } else if (devinfo->is_haswell) {
 494          /* WaCSScratchSize:hsw
 495           *
 496           * Haswell's scratch space address calculation appears to be sparse
 497           * rather than tightly packed. The Thread ID has bits indicating
 498           * which subslice, EU within a subslice, and thread within an EU it
 499           * is. There's a maximum of two slices and two subslices, so these
 500           * can be stored with a single bit. Even though there are only 10 EUs
 501           * per subslice, this is stored in 4 bits, so there's an effective
 502           * maximum value of 16 EUs. Similarly, although there are only 7
 503           * threads per EU, this is stored in a 3 bit number, giving an
 504           * effective maximum value of 8 threads per EU.
 505           *
 506           * This means that we need to use 16 * 8 instead of 10 * 7 for the
 507           * number of threads per subslice.
 508           */
 509          scratch_ids_per_subslice = 16 * 8;
 510       } else if (devinfo->is_cherryview) {
 511          /* Cherryview devices have either 6 or 8 EUs per subslice, and each
 512           * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
 513           * as if it had 8 EUs.
 514           */
 515          scratch_ids_per_subslice = 8 * 7;
 516       } else {
 517          scratch_ids_per_subslice = devinfo->max_cs_threads;
 518       }
 519
 520       thread_count = scratch_ids_per_subslice * subslices;
 521       break;
 522    }
 523    default:
 524       unreachable("Unsupported stage!");
 525    }
 526
 527    stage_state->scratch_bo =
 528       brw_bo_alloc(brw->bufmgr, "shader scratch space",
 529                    per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
 530 }
 531
 532 void brwInitFragProgFuncs( struct dd_function_table *functions )
 533 {
 534    assert(functions->ProgramStringNotify == _tnl_program_string);
 535
 536    functions->NewProgram = brwNewProgram;
 537    functions->DeleteProgram = brwDeleteProgram;
 538    functions->ProgramStringNotify = brwProgramStringNotify;
 539
 540    functions->LinkShader = brw_link_shader;
 541
 542    functions->MemoryBarrier = brw_memory_barrier;
 543    functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 544 }
 545
 546 struct shader_times {
 547    uint64_t time;
 548    uint64_t written;
 549    uint64_t reset;
 550 };
 551
 552 void
 553 brw_init_shader_time(struct brw_context *brw)
 554 {
 555    const int max_entries = 2048;
 556    brw->shader_time.bo =
 557       brw_bo_alloc(brw->bufmgr, "shader time",
 558                    max_entries * BRW_SHADER_TIME_STRIDE * 3,
 559                    BRW_MEMZONE_OTHER);
 560    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
 561    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
 562    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
 563                                           max_entries);
 564    brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
 565                                                max_entries);
 566    brw->shader_time.max_entries = max_entries;
 567 }
 568
 569 static int
 570 compare_time(const void *a, const void *b)
 571 {
 572    uint64_t * const *a_val = a;
 573    uint64_t * const *b_val = b;
 574
 575    /* We don't just subtract because we're turning the value to an int. */
 576    if (**a_val < **b_val)
 577       return -1;
 578    else if (**a_val == **b_val)
 579       return 0;
 580    else
 581       return 1;
 582 }
 583
 584 static void
 585 print_shader_time_line(const char *stage, const char *name,
 586                        int shader_num, uint64_t time, uint64_t total)
 587 {
 588    fprintf(stderr, "%-6s%-18s", stage, name);
 589
 590    if (shader_num != 0)
 591       fprintf(stderr, "%4d: ", shader_num);
 592    else
 593       fprintf(stderr, "    : ");
 594
 595    fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
 596            (long long)time,
 597            (double)time / 1000000000.0,
 598            (double)time / total * 100.0);
 599 }
 600
 601 static void
 602 brw_report_shader_time(struct brw_context *brw)
 603 {
 604    if (!brw->shader_time.bo || !brw->shader_time.num_entries)
 605       return;
 606
 607    uint64_t scaled[brw->shader_time.num_entries];
 608    uint64_t *sorted[brw->shader_time.num_entries];
 609    uint64_t total_by_type[ST_CS + 1];
 610    memset(total_by_type, 0, sizeof(total_by_type));
 611    double total = 0;
 612    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 613       uint64_t written = 0, reset = 0;
 614       enum shader_time_shader_type type = brw->shader_time.types[i];
 615
 616       sorted[i] = &scaled[i];
 617
 618       switch (type) {
 619       case ST_VS:
 620       case ST_TCS:
 621       case ST_TES:
 622       case ST_GS:
 623       case ST_FS8:
 624       case ST_FS16:
 625       case ST_FS32:
 626       case ST_CS:
 627          written = brw->shader_time.cumulative[i].written;
 628          reset = brw->shader_time.cumulative[i].reset;
 629          break;
 630
 631       default:
 632          /* I sometimes want to print things that aren't the 3 shader times.
 633           * Just print the sum in that case.
 634           */
 635          written = 1;
 636          reset = 0;
 637          break;
 638       }
 639
 640       uint64_t time = brw->shader_time.cumulative[i].time;
 641       if (written) {
 642          scaled[i] = time / written * (written + reset);
 643       } else {
 644          scaled[i] = time;
 645       }
 646
 647       switch (type) {
 648       case ST_VS:
 649       case ST_TCS:
 650       case ST_TES:
 651       case ST_GS:
 652       case ST_FS8:
 653       case ST_FS16:
 654       case ST_FS32:
 655       case ST_CS:
 656          total_by_type[type] += scaled[i];
 657          break;
 658       default:
 659          break;
 660       }
 661
 662       total += scaled[i];
 663    }
 664
 665    if (total == 0) {
 666       fprintf(stderr, "No shader time collected yet\n");
 667       return;
 668    }
 669
 670    qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
 671
 672    fprintf(stderr, "\n");
 673    fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
 674    for (int s = 0; s < brw->shader_time.num_entries; s++) {
 675       const char *stage;
 676       /* Work back from the sorted pointers times to a time to print. */
 677       int i = sorted[s] - scaled;
 678
 679       if (scaled[i] == 0)
 680          continue;
 681
 682       int shader_num = brw->shader_time.ids[i];
 683       const char *shader_name = brw->shader_time.names[i];
 684
 685       switch (brw->shader_time.types[i]) {
 686       case ST_VS:
 687          stage = "vs";
 688          break;
 689       case ST_TCS:
 690          stage = "tcs";
 691          break;
 692       case ST_TES:
 693          stage = "tes";
 694          break;
 695       case ST_GS:
 696          stage = "gs";
 697          break;
 698       case ST_FS8:
 699          stage = "fs8";
 700          break;
 701       case ST_FS16:
 702          stage = "fs16";
 703          break;
 704       case ST_FS32:
 705          stage = "fs32";
 706          break;
 707       case ST_CS:
 708          stage = "cs";
 709          break;
 710       default:
 711          stage = "other";
 712          break;
 713       }
 714
 715       print_shader_time_line(stage, shader_name, shader_num,
 716                              scaled[i], total);
 717    }
 718
 719    fprintf(stderr, "\n");
 720    print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
 721    print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
 722    print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
 723    print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
 724    print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
 725    print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
 726    print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
 727    print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
 728 }
 729
 730 static void
 731 brw_collect_shader_time(struct brw_context *brw)
 732 {
 733    if (!brw->shader_time.bo)
 734       return;
 735
 736    /* This probably stalls on the last rendering.  We could fix that by
 737     * delaying reading the reports, but it doesn't look like it's a big
 738     * overhead compared to the cost of tracking the time in the first place.
 739     */
 740    void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 741
 742    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 743       uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
 744
 745       brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
 746       brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
 747       brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
 748    }
 749
 750    /* Zero the BO out to clear it out for our next collection.
 751     */
 752    memset(bo_map, 0, brw->shader_time.bo->size);
 753    brw_bo_unmap(brw->shader_time.bo);
 754 }
 755
 756 void
 757 brw_collect_and_report_shader_time(struct brw_context *brw)
 758 {
 759    brw_collect_shader_time(brw);
 760
 761    if (brw->shader_time.report_time == 0 ||
 762        get_time() - brw->shader_time.report_time >= 1.0) {
 763       brw_report_shader_time(brw);
 764       brw->shader_time.report_time = get_time();
 765    }
 766 }
 767
 768 /**
 769  * Chooses an index in the shader_time buffer and sets up tracking information
 770  * for our printouts.
 771  *
 772  * Note that this holds on to references to the underlying programs, which may
 773  * change their lifetimes compared to normal operation.
 774  */
 775 int
 776 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
 777                           enum shader_time_shader_type type, bool is_glsl_sh)
 778 {
 779    int shader_time_index = brw->shader_time.num_entries++;
 780    assert(shader_time_index < brw->shader_time.max_entries);
 781    brw->shader_time.types[shader_time_index] = type;
 782
 783    const char *name;
 784    if (prog->Id == 0) {
 785       name = "ff";
 786    } else if (is_glsl_sh) {
 787       name = prog->info.label ?
 788          ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
 789    } else {
 790       name = "prog";
 791    }
 792
 793    brw->shader_time.names[shader_time_index] = name;
 794    brw->shader_time.ids[shader_time_index] = prog->Id;
 795
 796    return shader_time_index;
 797 }
 798
 799 void
 800 brw_destroy_shader_time(struct brw_context *brw)
 801 {
 802    brw_bo_unreference(brw->shader_time.bo);
 803    brw->shader_time.bo = NULL;
 804 }
 805
 806 void
 807 brw_stage_prog_data_free(const void *p)
 808 {
 809    struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
 810
 811    ralloc_free(prog_data->param);
 812    ralloc_free(prog_data->pull_param);
 813 }
 814
 815 void
 816 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 817 {
 818    fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
 819            stage, prog->Id, stage);
 820    _mesa_print_program(prog);
 821 }
 822
 823 void
 824 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
 825                              struct brw_sampler_prog_key_data *tex,
 826                              const struct gl_program *prog)
 827 {
 828    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
 829    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
 830    for (unsigned i = 0; i < sampler_count; i++) {
 831       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
 832          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
 833          tex->swizzles[i] =
 834             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
 835       } else {
 836          /* Color sampler: assume no swizzling. */
 837          tex->swizzles[i] = SWIZZLE_XYZW;
 838       }
 839    }
 840 }
 841
 842 /**
 843  * Sets up the starting offsets for the groups of binding table entries
 844  * common to all pipeline stages.
 845  *
 846  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 847  * unused but also make sure that addition of small offsets to them will
 848  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 849  */
 850 uint32_t
 851 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
 852                                         const struct gl_program *prog,
 853                                         struct brw_stage_prog_data *stage_prog_data,
 854                                         uint32_t next_binding_table_offset)
 855 {
 856    int num_textures = util_last_bit(prog->SamplersUsed);
 857
 858    stage_prog_data->binding_table.texture_start = next_binding_table_offset;
 859    next_binding_table_offset += num_textures;
 860
 861    if (prog->info.num_ubos) {
 862       assert(prog->info.num_ubos <= BRW_MAX_UBO);
 863       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
 864       next_binding_table_offset += prog->info.num_ubos;
 865    } else {
 866       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
 867    }
 868
 869    if (prog->info.num_ssbos || prog->info.num_abos) {
 870       assert(prog->info.num_abos <= BRW_MAX_ABO);
 871       assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
 872       stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
 873       next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
 874    } else {
 875       stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
 876    }
 877
 878    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
 879       stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
 880       next_binding_table_offset++;
 881    } else {
 882       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
 883    }
 884
 885    if (prog->info.uses_texture_gather) {
 886       if (devinfo->gen >= 8) {
 887          stage_prog_data->binding_table.gather_texture_start =
 888             stage_prog_data->binding_table.texture_start;
 889       } else {
 890          stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
 891          next_binding_table_offset += num_textures;
 892       }
 893    } else {
 894       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
 895    }
 896
 897    if (prog->info.num_images) {
 898       stage_prog_data->binding_table.image_start = next_binding_table_offset;
 899       next_binding_table_offset += prog->info.num_images;
 900    } else {
 901       stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
 902    }
 903
 904    /* This may or may not be used depending on how the compile goes. */
 905    stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
 906    next_binding_table_offset++;
 907
 908    /* Plane 0 is just the regular texture section */
 909    stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
 910
 911    stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
 912    next_binding_table_offset += num_textures;
 913
 914    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
 915    next_binding_table_offset += num_textures;
 916
 917    /* Set the binding table size.  Some callers may append new entries
 918     * and increase this accordingly.
 919     */
 920    stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
 921
 922    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
 923    return next_binding_table_offset;
 924 }
 925
 926 void
 927 brw_populate_default_key(const struct brw_compiler *compiler,
 928                          union brw_any_prog_key *prog_key,
 929                          struct gl_shader_program *sh_prog,
 930                          struct gl_program *prog)
 931 {
 932    switch (prog->info.stage) {
 933    case MESA_SHADER_VERTEX:
 934       brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
 935       break;
 936    case MESA_SHADER_TESS_CTRL:
 937       brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
 938       break;
 939    case MESA_SHADER_TESS_EVAL:
 940       brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
 941       break;
 942    case MESA_SHADER_GEOMETRY:
 943       brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
 944       break;
 945    case MESA_SHADER_FRAGMENT:
 946       brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
 947       break;
 948    case MESA_SHADER_COMPUTE:
 949       brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
 950       break;
 951    default:
 952       unreachable("Unsupported stage!");
 953    }
 954 }
 955
 956 void
 957 brw_debug_recompile(struct brw_context *brw,
 958                     gl_shader_stage stage,
 959                     unsigned api_id,
 960                     struct brw_base_prog_key *key)
 961 {
 962    const struct brw_compiler *compiler = brw->screen->compiler;
 963    enum brw_cache_id cache_id = brw_stage_cache_id(stage);
 964
 965    compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
 966                              _mesa_shader_stage_to_string(stage), api_id);
 967
 968    const void *old_key =
 969       brw_find_previous_compile(&brw->cache, cache_id, key->program_string_id);
 970
 971    brw_debug_key_recompile(compiler, brw, stage, old_key, key);
 972 }