nir: add callback to nir_remove_dead_variables()
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 #include <pthread.h>
33 #include "main/glspirv.h"
34 #include "program/prog_parameter.h"
35 #include "program/prog_print.h"
36 #include "program/prog_to_nir.h"
37 #include "program/program.h"
38 #include "program/programopt.h"
39 #include "tnl/tnl.h"
40 #include "util/ralloc.h"
41 #include "compiler/glsl/ir.h"
42 #include "compiler/glsl/program.h"
43 #include "compiler/glsl/gl_nir.h"
44 #include "compiler/glsl/glsl_to_nir.h"
45
46 #include "brw_program.h"
47 #include "brw_context.h"
48 #include "compiler/brw_nir.h"
49 #include "brw_defines.h"
50 #include "intel_batchbuffer.h"
51
52 #include "brw_cs.h"
53 #include "brw_gs.h"
54 #include "brw_vs.h"
55 #include "brw_wm.h"
56 #include "brw_state.h"
57
58 #include "main/shaderapi.h"
59 #include "main/shaderobj.h"
60
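/* Assign uniform variable locations and lower uniform loads to offsets.
 * Scalar back-ends lay uniforms out with scalar granularity while the vec4
 * back-end uses vec4 slots; the only difference between the two paths is
 * the type_size callback handed to nir_assign_var_locations() and
 * nir_lower_io().
 */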
61 static bool
62 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
63 {
64 if (is_scalar) {
65 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
66 type_size_scalar_bytes);
67 return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
68 } else {
69 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
70 type_size_vec4_bytes);
71 return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
72 }
73 }
74
75 static struct gl_program *brwNewProgram(struct gl_context *ctx,
76 gl_shader_stage stage,
77 GLuint id, bool is_arb_asm);
78
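/* Translate a program into NIR for the i965 back-end.
 *
 * Linked GLSL and SPIR-V shaders arrive via glsl_to_nir() or
 * _mesa_spirv_to_nir(); ARB assembly and fixed-function programs arrive via
 * prog_to_nir(). The result is cleaned up (dead variable removal,
 * I/O-to-temporaries, brw_preprocess_nir), given stage-specific lowering
 * (gl_PatchVerticesIn, the fragment Y-transform), and finally has its
 * uniforms lowered for the scalar or vec4 back-end.
 */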
79 nir_shader *
80 brw_create_nir(struct brw_context *brw,
81 const struct gl_shader_program *shader_prog,
82 struct gl_program *prog,
83 gl_shader_stage stage,
84 bool is_scalar)
85 {
86 const struct gen_device_info *devinfo = &brw->screen->devinfo;
87 struct gl_context *ctx = &brw->ctx;
88 const nir_shader_compiler_options *options =
89 ctx->Const.ShaderCompilerOptions[stage].NirOptions;
90 nir_shader *nir;
91
92 /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
93 if (shader_prog) {
94 if (shader_prog->data->spirv) {
95 nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
96 } else {
97 nir = glsl_to_nir(ctx, shader_prog, stage, options);
98
99          /* Remap the locations to slots so that variables requiring two slots
100          * occupy two locations. For instance, if the IR has a dvec3 attr0 in
101          * location 0 and a vec4 attr1 in location 1, then in NIR attr0 will use
102          * locations/slots 0 and 1, and attr1 will use location/slot 2.
103          */
104 if (nir->info.stage == MESA_SHADER_VERTEX)
105 nir_remap_dual_slot_attributes(nir, &prog->DualSlotInputs);
106 }
107 assert (nir);
108
109 nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out,
110 NULL);
111 nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
112 NIR_PASS_V(nir, nir_lower_io_to_temporaries,
113 nir_shader_get_entrypoint(nir), true, false);
114 } else {
115 nir = prog_to_nir(prog, options);
116 NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
117 }
118 nir_validate_shader(nir, "before brw_preprocess_nir");
119
120 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
121
122 if (!ctx->SoftFP64 && nir->info.uses_64bit &&
123 (options->lower_doubles_options & nir_lower_fp64_full_software)) {
124 ctx->SoftFP64 = glsl_float64_funcs_to_nir(ctx, options);
125 }
126
127 brw_preprocess_nir(brw->screen->compiler, nir, ctx->SoftFP64);
128
129 if (stage == MESA_SHADER_TESS_CTRL) {
130 /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
131 static const gl_state_index16 tokens[STATE_LENGTH] =
132 { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
133 nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
134 }
135
136 if (stage == MESA_SHADER_TESS_EVAL) {
137 /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
138 * a uniform if we don't.
139 */
140 struct gl_linked_shader *tcs =
141 shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
142 uint32_t static_patch_vertices =
143 tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
144 static const gl_state_index16 tokens[STATE_LENGTH] =
145 { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
146 nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
147 }
148
149 if (stage == MESA_SHADER_FRAGMENT) {
150 static const struct nir_lower_wpos_ytransform_options wpos_options = {
151 .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
152 .fs_coord_pixel_center_integer = 1,
153 .fs_coord_origin_upper_left = 1,
154 };
155
156 bool progress = false;
157 NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
158 if (progress) {
159 _mesa_add_state_reference(prog->Parameters,
160 wpos_options.state_tokens);
161 }
162 }
163
164 NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
165
166 return nir;
167 }
168
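/* Size/alignment callback used when laying out shared (workgroup) variables:
 * booleans take 4 bytes, and a 3-component vector is aligned like a
 * 4-component one (std430-style), so e.g. a vec3 reports size 12 and
 * alignment 16.
 */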
169 static void
170 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
171 {
172 assert(glsl_type_is_vector_or_scalar(type));
173
174 uint32_t comp_size = glsl_type_is_boolean(type)
175 ? 4 : glsl_get_bit_size(type) / 8;
176 unsigned length = glsl_get_vector_elements(type);
177    *size = comp_size * length;
178 *align = comp_size * (length == 3 ? 4 : length);
179 }
180
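/* Lower resource access (samplers, images, UBOs/SSBOs) to the index-based
 * form the back-end expects. For SPIR-V compute shaders, shared-memory
 * variables additionally get an explicit layout and are lowered to 32-bit
 * offsets.
 */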
181 void
182 brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
183 struct gl_program *prog,
184 const struct gen_device_info *devinfo)
185 {
186 NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
187 prog->info.textures_used = prog->nir->info.textures_used;
188 prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;
189
190 NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo, NULL);
191
192 if (prog->nir->info.stage == MESA_SHADER_COMPUTE &&
193 shader_prog->data->spirv) {
194 NIR_PASS_V(prog->nir, nir_lower_vars_to_explicit_types,
195 nir_var_mem_shared, shared_type_info);
196 NIR_PASS_V(prog->nir, nir_lower_explicit_io,
197 nir_var_mem_shared, nir_address_format_32bit_offset);
198 }
199
200 NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
201 /* Do a round of constant folding to clean up address calculations */
202 NIR_PASS_V(prog->nir, nir_opt_constant_folding);
203 }
204
205 void
206 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
207 {
208 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
209
210 /* Copy the info we just generated back into the gl_program */
211 const char *prog_name = prog->info.name;
212 const char *prog_label = prog->info.label;
213 prog->info = nir->info;
214 prog->info.name = prog_name;
215 prog->info.label = prog_label;
216 }
217
218 static unsigned
219 get_new_program_id(struct intel_screen *screen)
220 {
221 return p_atomic_inc_return(&screen->program_id);
222 }
223
224 static struct gl_program *brwNewProgram(struct gl_context *ctx,
225 gl_shader_stage stage,
226 GLuint id, bool is_arb_asm)
227 {
228 struct brw_context *brw = brw_context(ctx);
229 struct brw_program *prog = rzalloc(NULL, struct brw_program);
230
231 if (prog) {
232 prog->id = get_new_program_id(brw->screen);
233
234 return _mesa_init_gl_program(&prog->program, stage, id, is_arb_asm);
235 }
236
237 return NULL;
238 }
239
240 static void brwDeleteProgram( struct gl_context *ctx,
241 struct gl_program *prog )
242 {
243 struct brw_context *brw = brw_context(ctx);
244
245 /* Beware! prog's refcount has reached zero, and it's about to be freed.
246 *
247 * In brw_upload_pipeline_state(), we compare brw->programs[i] to
248 * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
249 * pointer has changed.
250 *
251 * We cannot leave brw->programs[i] as a dangling pointer to the dead
252 * program. malloc() may allocate the same memory for a new gl_program,
253 * causing us to see matching pointers...but totally different programs.
254 *
255 * We cannot set brw->programs[i] to NULL, either. If we've deleted the
256 * active program, Mesa may set ctx->FooProgram._Current to NULL. That
257 * would cause us to see matching pointers (NULL == NULL), and fail to
258 * detect that a program has changed since our last draw.
259 *
260 * So, set it to a bogus gl_program pointer that will never match,
261 * causing us to properly reevaluate the state on our next draw.
262 *
263 * Getting this wrong causes heisenbugs which are very hard to catch,
264 * as you need a very specific allocation pattern to hit the problem.
265 */
266 static const struct gl_program deleted_program;
267
268 for (int i = 0; i < MESA_SHADER_STAGES; i++) {
269 if (brw->programs[i] == prog)
270 brw->programs[i] = (struct gl_program *) &deleted_program;
271 }
272
273 _mesa_delete_program( ctx, prog );
274 }
275
276
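/* Called by core Mesa when the source of an ARB vertex or fragment program
 * (including fixed-function vertex programs) changes: translate it to NIR,
 * give it a fresh program id, and precompile it.
 */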
277 static GLboolean
278 brwProgramStringNotify(struct gl_context *ctx,
279 GLenum target,
280 struct gl_program *prog)
281 {
282 assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
283
284 struct brw_context *brw = brw_context(ctx);
285 const struct brw_compiler *compiler = brw->screen->compiler;
286
287 switch (target) {
288 case GL_FRAGMENT_PROGRAM_ARB: {
289 struct brw_program *newFP = brw_program(prog);
290 const struct brw_program *curFP =
291 brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
292
293 if (newFP == curFP)
294 brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
295 _mesa_program_fragment_position_to_sysval(&newFP->program);
296 newFP->id = get_new_program_id(brw->screen);
297
298 prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
299
300 brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);
301
302 brw_shader_gather_info(prog->nir, prog);
303
304 brw_fs_precompile(ctx, prog);
305 break;
306 }
307 case GL_VERTEX_PROGRAM_ARB: {
308 struct brw_program *newVP = brw_program(prog);
309 const struct brw_program *curVP =
310 brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
311
312 if (newVP == curVP)
313 brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
314 if (newVP->program.arb.IsPositionInvariant) {
315 _mesa_insert_mvp_code(ctx, &newVP->program);
316 }
317 newVP->id = get_new_program_id(brw->screen);
318
319 /* Also tell tnl about it:
320 */
321 _tnl_program_string(ctx, target, prog);
322
323 prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
324 compiler->scalar_stage[MESA_SHADER_VERTEX]);
325
326 brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);
327
328 brw_shader_gather_info(prog->nir, prog);
329
330 brw_vs_precompile(ctx, prog);
331 break;
332 }
333 default:
334 /*
335 * driver->ProgramStringNotify is only called for ARB programs, fixed
336 * function vertex programs, and ir_to_mesa (which isn't used by the
337 * i965 back-end). Therefore, even after geometry shaders are added,
338 * this function should only ever be called with a target of
339 * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
340 */
341 unreachable("Unexpected target in brwProgramStringNotify");
342 }
343
344 return true;
345 }
346
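/* Driver hook for glMemoryBarrier(): translate the GL barrier bits into
 * PIPE_CONTROL flush/invalidate bits. A data-cache flush plus CS stall is
 * always emitted; the individual bits only add cache invalidations and
 * render-target flushes for the units that may consume the written data.
 */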
347 static void
348 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
349 {
350 struct brw_context *brw = brw_context(ctx);
351 const struct gen_device_info *devinfo = &brw->screen->devinfo;
352 unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
353 assert(devinfo->gen >= 7 && devinfo->gen <= 11);
354
355 if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
356 GL_ELEMENT_ARRAY_BARRIER_BIT |
357 GL_COMMAND_BARRIER_BIT))
358 bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
359
360 if (barriers & GL_UNIFORM_BARRIER_BIT)
361 bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
362 PIPE_CONTROL_CONST_CACHE_INVALIDATE);
363
364 if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
365 bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
366
367 if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
368 GL_PIXEL_BUFFER_BARRIER_BIT))
369 bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
370 PIPE_CONTROL_RENDER_TARGET_FLUSH);
371
372 if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
373 bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
374 PIPE_CONTROL_RENDER_TARGET_FLUSH);
375
376 /* Typed surface messages are handled by the render cache on IVB, so we
377 * need to flush it too.
378 */
379 if (devinfo->gen == 7 && !devinfo->is_haswell)
380 bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
381
382 brw_emit_pipe_control_flush(brw, bits);
383 }
384
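/* Driver hook for the framebuffer-fetch barrier. When coherent framebuffer
 * fetch (EXT_shader_framebuffer_fetch) is not exposed, flush the render
 * target and invalidate the texture cache so that subsequent non-coherent
 * fetches observe prior rendering.
 */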
385 static void
386 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
387 {
388 struct brw_context *brw = brw_context(ctx);
389 const struct gen_device_info *devinfo = &brw->screen->devinfo;
390
391 if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
392 if (devinfo->gen >= 6) {
393 brw_emit_pipe_control_flush(brw,
394 PIPE_CONTROL_RENDER_TARGET_FLUSH |
395 PIPE_CONTROL_CS_STALL);
396 brw_emit_pipe_control_flush(brw,
397 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
398 } else {
399 brw_emit_pipe_control_flush(brw,
400 PIPE_CONTROL_RENDER_TARGET_FLUSH);
401 }
402 }
403 }
404
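/* Make sure *scratch_bo points at a scratch buffer of at least `size` bytes,
 * dropping and reallocating the old BO if it is too small.
 */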
405 void
406 brw_get_scratch_bo(struct brw_context *brw,
407 struct brw_bo **scratch_bo, int size)
408 {
409 struct brw_bo *old_bo = *scratch_bo;
410
411 if (old_bo && old_bo->size < size) {
412 brw_bo_unreference(old_bo);
413 old_bo = NULL;
414 }
415
416 if (!old_bo) {
417 *scratch_bo =
418 brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
419 }
420 }
421
422 /**
423  * Reserve enough scratch space for the given stage to hold \p per_thread_size
424  * bytes per thread, times the maximum number of threads for that stage.
425 */
426 void
427 brw_alloc_stage_scratch(struct brw_context *brw,
428 struct brw_stage_state *stage_state,
429 unsigned per_thread_size)
430 {
431 if (stage_state->per_thread_scratch >= per_thread_size)
432 return;
433
434 stage_state->per_thread_scratch = per_thread_size;
435
436 if (stage_state->scratch_bo)
437 brw_bo_unreference(stage_state->scratch_bo);
438
439 const struct gen_device_info *devinfo = &brw->screen->devinfo;
440 unsigned thread_count;
441    switch (stage_state->stage) {
442 case MESA_SHADER_VERTEX:
443 thread_count = devinfo->max_vs_threads;
444 break;
445 case MESA_SHADER_TESS_CTRL:
446 thread_count = devinfo->max_tcs_threads;
447 break;
448 case MESA_SHADER_TESS_EVAL:
449 thread_count = devinfo->max_tes_threads;
450 break;
451 case MESA_SHADER_GEOMETRY:
452 thread_count = devinfo->max_gs_threads;
453 break;
454 case MESA_SHADER_FRAGMENT:
455 thread_count = devinfo->max_wm_threads;
456 break;
457 case MESA_SHADER_COMPUTE: {
458 unsigned subslices = MAX2(brw->screen->subslice_total, 1);
459
460 /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
461 *
462 * "Scratch Space per slice is computed based on 4 sub-slices. SW must
463 * allocate scratch space enough so that each slice has 4 slices
464 * allowed."
465 *
466 * According to the other driver team, this applies to compute shaders
467 * as well. This is not currently documented at all.
468 *
469       * brw->screen->subslice_total is the TOTAL number of subslices;
470       * here we want to act as if there are 4 subslices per slice
471       * instead of using the actual number of subslices per slice.
472 *
473       * For ICL, scratch space allocation is based on the number of threads
474 * in the base configuration.
475 */
476 if (devinfo->gen == 11)
477 subslices = 8;
478 else if (devinfo->gen >= 9 && devinfo->gen < 11)
479 subslices = 4 * brw->screen->devinfo.num_slices;
480
481 unsigned scratch_ids_per_subslice;
482 if (devinfo->gen >= 11) {
483 /* The MEDIA_VFE_STATE docs say:
484 *
485 * "Starting with this configuration, the Maximum Number of
486 * Threads must be set to (#EU * 8) for GPGPU dispatches.
487 *
488 * Although there are only 7 threads per EU in the configuration,
489 * the FFTID is calculated as if there are 8 threads per EU,
490 * which in turn requires a larger amount of Scratch Space to be
491 * allocated by the driver."
492 */
493 scratch_ids_per_subslice = 8 * 8;
494 } else if (devinfo->is_haswell) {
495 /* WaCSScratchSize:hsw
496 *
497 * Haswell's scratch space address calculation appears to be sparse
498 * rather than tightly packed. The Thread ID has bits indicating
499 * which subslice, EU within a subslice, and thread within an EU it
500 * is. There's a maximum of two slices and two subslices, so these
501 * can be stored with a single bit. Even though there are only 10 EUs
502 * per subslice, this is stored in 4 bits, so there's an effective
503 * maximum value of 16 EUs. Similarly, although there are only 7
504 * threads per EU, this is stored in a 3 bit number, giving an
505 * effective maximum value of 8 threads per EU.
506 *
507 * This means that we need to use 16 * 8 instead of 10 * 7 for the
508 * number of threads per subslice.
509 */
510 scratch_ids_per_subslice = 16 * 8;
511 } else if (devinfo->is_cherryview) {
512 /* Cherryview devices have either 6 or 8 EUs per subslice, and each
513 * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
514 * as if it had 8 EUs.
515 */
516 scratch_ids_per_subslice = 8 * 7;
517 } else {
518 scratch_ids_per_subslice = devinfo->max_cs_threads;
519 }
520
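/* Worked example (illustrative arithmetic only): on Gen11 the values above
 * give scratch_ids_per_subslice = 8 * 8 = 64 and subslices = 8, so
 * thread_count = 512; a kernel needing 2048 bytes of scratch per thread
 * then gets a 1 MB scratch BO from the allocation below.
 */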
521 thread_count = scratch_ids_per_subslice * subslices;
522 break;
523 }
524 default:
525 unreachable("Unsupported stage!");
526 }
527
528 stage_state->scratch_bo =
529 brw_bo_alloc(brw->bufmgr, "shader scratch space",
530 per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
531 }
532
533 void brwInitFragProgFuncs( struct dd_function_table *functions )
534 {
535 assert(functions->ProgramStringNotify == _tnl_program_string);
536
537 functions->NewProgram = brwNewProgram;
538 functions->DeleteProgram = brwDeleteProgram;
539 functions->ProgramStringNotify = brwProgramStringNotify;
540
541 functions->LinkShader = brw_link_shader;
542
543 functions->MemoryBarrier = brw_memory_barrier;
544 functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
545 }
546
547 struct shader_times {
548 uint64_t time;
549 uint64_t written;
550 uint64_t reset;
551 };
552
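/* Set up the INTEL_DEBUG=shader_time bookkeeping. Each of the max_entries
 * entries owns three BRW_SHADER_TIME_STRIDE-sized slots in the BO
 * (accumulated time, write count, reset count), mirrored on the CPU side by
 * struct shader_times; see brw_collect_shader_time().
 */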
553 void
554 brw_init_shader_time(struct brw_context *brw)
555 {
556 const int max_entries = 2048;
557 brw->shader_time.bo =
558 brw_bo_alloc(brw->bufmgr, "shader time",
559 max_entries * BRW_SHADER_TIME_STRIDE * 3,
560 BRW_MEMZONE_OTHER);
561 brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
562 brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
563 brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
564 max_entries);
565 brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
566 max_entries);
567 brw->shader_time.max_entries = max_entries;
568 }
569
570 static int
571 compare_time(const void *a, const void *b)
572 {
573 uint64_t * const *a_val = a;
574 uint64_t * const *b_val = b;
575
576    /* We don't just subtract because the difference would be truncated to an int. */
577 if (**a_val < **b_val)
578 return -1;
579 else if (**a_val == **b_val)
580 return 0;
581 else
582 return 1;
583 }
584
585 static void
586 print_shader_time_line(const char *stage, const char *name,
587 int shader_num, uint64_t time, uint64_t total)
588 {
589 fprintf(stderr, "%-6s%-18s", stage, name);
590
591 if (shader_num != 0)
592 fprintf(stderr, "%4d: ", shader_num);
593 else
594 fprintf(stderr, " : ");
595
596 fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n",
597 (long long)time,
598 (double)time / 1000000000.0,
599 (double)time / total * 100.0);
600 }
601
602 static void
603 brw_report_shader_time(struct brw_context *brw)
604 {
605 if (!brw->shader_time.bo || !brw->shader_time.num_entries)
606 return;
607
608 uint64_t scaled[brw->shader_time.num_entries];
609 uint64_t *sorted[brw->shader_time.num_entries];
610 uint64_t total_by_type[ST_CS + 1];
611 memset(total_by_type, 0, sizeof(total_by_type));
612 double total = 0;
613 for (int i = 0; i < brw->shader_time.num_entries; i++) {
614 uint64_t written = 0, reset = 0;
615 enum shader_time_shader_type type = brw->shader_time.types[i];
616
617 sorted[i] = &scaled[i];
618
619 switch (type) {
620 case ST_VS:
621 case ST_TCS:
622 case ST_TES:
623 case ST_GS:
624 case ST_FS8:
625 case ST_FS16:
626 case ST_FS32:
627 case ST_CS:
628 written = brw->shader_time.cumulative[i].written;
629 reset = brw->shader_time.cumulative[i].reset;
630 break;
631
632 default:
633 /* I sometimes want to print things that aren't the 3 shader times.
634 * Just print the sum in that case.
635 */
636 written = 1;
637 reset = 0;
638 break;
639 }
640
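/* Scale the accumulated time up by (written + reset) / written, i.e. assume
 * the `reset` intervals whose results were not accumulated took the average
 * time of the `written` ones.
 */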
641 uint64_t time = brw->shader_time.cumulative[i].time;
642 if (written) {
643 scaled[i] = time / written * (written + reset);
644 } else {
645 scaled[i] = time;
646 }
647
648 switch (type) {
649 case ST_VS:
650 case ST_TCS:
651 case ST_TES:
652 case ST_GS:
653 case ST_FS8:
654 case ST_FS16:
655 case ST_FS32:
656 case ST_CS:
657 total_by_type[type] += scaled[i];
658 break;
659 default:
660 break;
661 }
662
663 total += scaled[i];
664 }
665
666 if (total == 0) {
667 fprintf(stderr, "No shader time collected yet\n");
668 return;
669 }
670
671 qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
672
673 fprintf(stderr, "\n");
674 fprintf(stderr, "type ID cycles spent %% of total\n");
675 for (int s = 0; s < brw->shader_time.num_entries; s++) {
676 const char *stage;
677       /* Work back from the sorted pointer to the index of the time to print. */
678 int i = sorted[s] - scaled;
679
680 if (scaled[i] == 0)
681 continue;
682
683 int shader_num = brw->shader_time.ids[i];
684 const char *shader_name = brw->shader_time.names[i];
685
686 switch (brw->shader_time.types[i]) {
687 case ST_VS:
688 stage = "vs";
689 break;
690 case ST_TCS:
691 stage = "tcs";
692 break;
693 case ST_TES:
694 stage = "tes";
695 break;
696 case ST_GS:
697 stage = "gs";
698 break;
699 case ST_FS8:
700 stage = "fs8";
701 break;
702 case ST_FS16:
703 stage = "fs16";
704 break;
705 case ST_FS32:
706 stage = "fs32";
707 break;
708 case ST_CS:
709 stage = "cs";
710 break;
711 default:
712 stage = "other";
713 break;
714 }
715
716 print_shader_time_line(stage, shader_name, shader_num,
717 scaled[i], total);
718 }
719
720 fprintf(stderr, "\n");
721 print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
722 print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
723 print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
724 print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
725 print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
726 print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
727 print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
728 print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
729 }
730
731 static void
732 brw_collect_shader_time(struct brw_context *brw)
733 {
734 if (!brw->shader_time.bo)
735 return;
736
737 /* This probably stalls on the last rendering. We could fix that by
738 * delaying reading the reports, but it doesn't look like it's a big
739 * overhead compared to the cost of tracking the time in the first place.
740 */
741 void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
742
743 for (int i = 0; i < brw->shader_time.num_entries; i++) {
744 uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
745
746 brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
747 brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
748 brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
749 }
750
751 /* Zero the BO out to clear it out for our next collection.
752 */
753 memset(bo_map, 0, brw->shader_time.bo->size);
754 brw_bo_unmap(brw->shader_time.bo);
755 }
756
757 void
758 brw_collect_and_report_shader_time(struct brw_context *brw)
759 {
760 brw_collect_shader_time(brw);
761
762 if (brw->shader_time.report_time == 0 ||
763 get_time() - brw->shader_time.report_time >= 1.0) {
764 brw_report_shader_time(brw);
765 brw->shader_time.report_time = get_time();
766 }
767 }
768
769 /**
770 * Chooses an index in the shader_time buffer and sets up tracking information
771 * for our printouts.
772 *
773 * Note that this holds on to references to the underlying programs, which may
774 * change their lifetimes compared to normal operation.
775 */
776 int
777 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
778 enum shader_time_shader_type type, bool is_glsl_sh)
779 {
780 int shader_time_index = brw->shader_time.num_entries++;
781 assert(shader_time_index < brw->shader_time.max_entries);
782 brw->shader_time.types[shader_time_index] = type;
783
784 const char *name;
785 if (prog->Id == 0) {
786 name = "ff";
787 } else if (is_glsl_sh) {
788 name = prog->info.label ?
789 ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
790 } else {
791 name = "prog";
792 }
793
794 brw->shader_time.names[shader_time_index] = name;
795 brw->shader_time.ids[shader_time_index] = prog->Id;
796
797 return shader_time_index;
798 }
799
800 void
801 brw_destroy_shader_time(struct brw_context *brw)
802 {
803 brw_bo_unreference(brw->shader_time.bo);
804 brw->shader_time.bo = NULL;
805 }
806
807 void
808 brw_stage_prog_data_free(const void *p)
809 {
810 struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
811
812 ralloc_free(prog_data->param);
813 ralloc_free(prog_data->pull_param);
814 }
815
816 void
817 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
818 {
819 fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
820 stage, prog->Id, stage);
821 _mesa_print_program(prog);
822 }
823
824 void
825 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
826 struct brw_sampler_prog_key_data *tex,
827 const struct gl_program *prog)
828 {
829 const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
830 unsigned sampler_count = util_last_bit(prog->SamplersUsed);
831 for (unsigned i = 0; i < sampler_count; i++) {
832 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
833 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
834 tex->swizzles[i] =
835 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
836 } else {
837 /* Color sampler: assume no swizzling. */
838 tex->swizzles[i] = SWIZZLE_XYZW;
839 }
840 }
841 }
842
843 /**
844 * Sets up the starting offsets for the groups of binding table entries
845 * common to all pipeline stages.
846 *
847  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
848  * unused, and also so that adding small offsets to them will trigger some of
849  * our asserts that surface indices are < BRW_MAX_SURFACES.
850 */
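/* A rough example of the resulting layout (illustrative only): a fragment
 * shader with two textures, one UBO, no SSBOs/atomics/images, no
 * textureGather() and shader_time disabled ends up with
 *
 *    texture_start        = 0   (2 entries)
 *    ubo_start            = 2   (1 entry)
 *    pull_constants_start = 3   (1 entry)
 *    plane_start[0]       = 0
 *    plane_start[1]       = 4   (2 entries)
 *    plane_start[2]       = 6   (2 entries)
 *
 * everything else set to 0xd0d0d0d0, and a returned next offset of 8
 * (size_bytes = 32).
 */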
851 uint32_t
852 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
853 const struct gl_program *prog,
854 struct brw_stage_prog_data *stage_prog_data,
855 uint32_t next_binding_table_offset)
856 {
857 int num_textures = util_last_bit(prog->SamplersUsed);
858
859 stage_prog_data->binding_table.texture_start = next_binding_table_offset;
860 next_binding_table_offset += num_textures;
861
862 if (prog->info.num_ubos) {
863 assert(prog->info.num_ubos <= BRW_MAX_UBO);
864 stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
865 next_binding_table_offset += prog->info.num_ubos;
866 } else {
867 stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
868 }
869
870 if (prog->info.num_ssbos || prog->info.num_abos) {
871 assert(prog->info.num_abos <= BRW_MAX_ABO);
872 assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
873 stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
874 next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
875 } else {
876 stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
877 }
878
879 if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
880 stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
881 next_binding_table_offset++;
882 } else {
883 stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
884 }
885
886 if (prog->info.uses_texture_gather) {
887 if (devinfo->gen >= 8) {
888 stage_prog_data->binding_table.gather_texture_start =
889 stage_prog_data->binding_table.texture_start;
890 } else {
891 stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
892 next_binding_table_offset += num_textures;
893 }
894 } else {
895 stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
896 }
897
898 if (prog->info.num_images) {
899 stage_prog_data->binding_table.image_start = next_binding_table_offset;
900 next_binding_table_offset += prog->info.num_images;
901 } else {
902 stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
903 }
904
905 /* This may or may not be used depending on how the compile goes. */
906 stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
907 next_binding_table_offset++;
908
909 /* Plane 0 is just the regular texture section */
910 stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
911
912 stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
913 next_binding_table_offset += num_textures;
914
915 stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
916 next_binding_table_offset += num_textures;
917
918 /* Set the binding table size. Some callers may append new entries
919 * and increase this accordingly.
920 */
921 stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
922
923 assert(next_binding_table_offset <= BRW_MAX_SURFACES);
924 return next_binding_table_offset;
925 }
926
927 void
928 brw_populate_default_key(const struct brw_compiler *compiler,
929 union brw_any_prog_key *prog_key,
930 struct gl_shader_program *sh_prog,
931 struct gl_program *prog)
932 {
933 switch (prog->info.stage) {
934 case MESA_SHADER_VERTEX:
935 brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
936 break;
937 case MESA_SHADER_TESS_CTRL:
938 brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
939 break;
940 case MESA_SHADER_TESS_EVAL:
941 brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
942 break;
943 case MESA_SHADER_GEOMETRY:
944 brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
945 break;
946 case MESA_SHADER_FRAGMENT:
947 brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
948 break;
949 case MESA_SHADER_COMPUTE:
950 brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
951 break;
952 default:
953 unreachable("Unsupported stage!");
954 }
955 }
956
957 void
958 brw_debug_recompile(struct brw_context *brw,
959 gl_shader_stage stage,
960 unsigned api_id,
961 struct brw_base_prog_key *key)
962 {
963 const struct brw_compiler *compiler = brw->screen->compiler;
964 enum brw_cache_id cache_id = brw_stage_cache_id(stage);
965
966 compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
967 _mesa_shader_stage_to_string(stage), api_id);
968
969 const void *old_key =
970 brw_find_previous_compile(&brw->cache, cache_id, key->program_string_id);
971
972 brw_debug_key_recompile(compiler, brw, stage, old_key, key);
973 }