i965/spirv: Lower shared memory later
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

#include <pthread.h>
#include "main/imports.h"
#include "main/glspirv.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_to_nir.h"
#include "program/program.h"
#include "program/programopt.h"
#include "tnl/tnl.h"
#include "util/ralloc.h"
#include "compiler/glsl/ir.h"
#include "compiler/glsl/program.h"
#include "compiler/glsl/gl_nir.h"
#include "compiler/glsl/glsl_to_nir.h"

#include "brw_program.h"
#include "brw_context.h"
#include "compiler/brw_nir.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"

#include "brw_cs.h"
#include "brw_gs.h"
#include "brw_vs.h"
#include "brw_wm.h"
#include "brw_state.h"

#include "main/shaderapi.h"
#include "main/shaderobj.h"

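/* Assign uniform variable locations and lower uniform access to offsets.
 * The scalar backend packs uniforms using scalar layout rules, while the
 * vec4 backend uses one vec4 slot per location, hence the two different
 * size/align callbacks.
 */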
static bool
brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
{
   if (is_scalar) {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_scalar_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
   } else {
      nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
                               type_size_vec4_bytes);
      return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
   }
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm);

nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(ctx, shader_prog, stage, options);
      }
      assert(nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (!ctx->SoftFP64 && nir->info.uses_64bit &&
       (options->lower_doubles_options & nir_lower_fp64_full_software)) {
      ctx->SoftFP64 = glsl_float64_funcs_to_nir(ctx, options);
   }

   brw_preprocess_nir(brw->screen->compiler, nir, ctx->SoftFP64);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a system value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}

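/* Size/align callback used when laying out shared memory explicitly.  The
 * rules match std430-style packing: a vecN of B-byte components occupies
 * N * B bytes, a vec3 aligns like a vec4, and booleans are lowered to 32
 * bits.  For example, a vec3 of floats has comp_size = 4 and length = 3,
 * giving *size = 12 and *align = 16.
 */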
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length;
   *align = comp_size * (length == 3 ? 4 : length);
}

void
brw_nir_lower_resources(nir_shader *nir, struct gl_shader_program *shader_prog,
                        struct gl_program *prog,
                        const struct gen_device_info *devinfo)
{
   NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shader_prog);
   prog->info.textures_used = prog->nir->info.textures_used;
   prog->info.textures_used_by_txf = prog->nir->info.textures_used_by_txf;

   NIR_PASS_V(prog->nir, brw_nir_lower_image_load_store, devinfo);

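   /* SPIR-V compute shaders still have shared memory in variable form at
    * this point.  Give the variables an explicit layout (using the
    * std430-like rules in shared_type_info above) and lower access to
    * 32-bit offset arithmetic; the GLSL path handles shared memory
    * elsewhere.
    */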
   if (prog->nir->info.stage == MESA_SHADER_COMPUTE &&
       shader_prog->data->spirv) {
      NIR_PASS_V(prog->nir, nir_lower_vars_to_explicit_types,
                 nir_var_mem_shared, shared_type_info);
      NIR_PASS_V(prog->nir, nir_lower_explicit_io,
                 nir_var_mem_shared, nir_address_format_32bit_offset);
   }

   NIR_PASS_V(prog->nir, gl_nir_lower_buffers, shader_prog);
   /* Do a round of constant folding to clean up address calculations */
   NIR_PASS_V(prog->nir, nir_opt_constant_folding);
}

void
brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Copy the info we just generated back into the gl_program */
   const char *prog_name = prog->info.name;
   const char *prog_label = prog->info.label;
   prog->info = nir->info;
   prog->info.name = prog_name;
   prog->info.label = prog_label;
}

static unsigned
get_new_program_id(struct intel_screen *screen)
{
   return p_atomic_inc_return(&screen->program_id);
}

static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_program *prog = rzalloc(NULL, struct brw_program);

   if (prog) {
      prog->id = get_new_program_id(brw->screen);

      return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
   }

   return NULL;
}

static void brwDeleteProgram( struct gl_context *ctx,
                              struct gl_program *prog )
{
   struct brw_context *brw = brw_context(ctx);

   /* Beware!  prog's refcount has reached zero, and it's about to be freed.
    *
    * In brw_upload_pipeline_state(), we compare brw->programs[i] to
    * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
    * pointer has changed.
    *
    * We cannot leave brw->programs[i] as a dangling pointer to the dead
    * program.  malloc() may allocate the same memory for a new gl_program,
    * causing us to see matching pointers...but totally different programs.
    *
    * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
    * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
    * would cause us to see matching pointers (NULL == NULL), and fail to
    * detect that a program has changed since our last draw.
    *
    * So, set it to a bogus gl_program pointer that will never match,
    * causing us to properly reevaluate the state on our next draw.
    *
    * Getting this wrong causes heisenbugs which are very hard to catch,
    * as you need a very specific allocation pattern to hit the problem.
    */
   static const struct gl_program deleted_program;

   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
      if (brw->programs[i] == prog)
         brw->programs[i] = (struct gl_program *) &deleted_program;
   }

   _mesa_delete_program( ctx, prog );
}


static GLboolean
brwProgramStringNotify(struct gl_context *ctx,
                       GLenum target,
                       struct gl_program *prog)
{
   assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);

   struct brw_context *brw = brw_context(ctx);
   const struct brw_compiler *compiler = brw->screen->compiler;

   switch (target) {
   case GL_FRAGMENT_PROGRAM_ARB: {
      struct brw_program *newFP = brw_program(prog);
      const struct brw_program *curFP =
         brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);

      if (newFP == curFP)
         brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
      _mesa_program_fragment_position_to_sysval(&newFP->program);
      newFP->id = get_new_program_id(brw->screen);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_fs_precompile(ctx, prog);
      break;
   }
   case GL_VERTEX_PROGRAM_ARB: {
      struct brw_program *newVP = brw_program(prog);
      const struct brw_program *curVP =
         brw_program_const(brw->programs[MESA_SHADER_VERTEX]);

      if (newVP == curVP)
         brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
      if (newVP->program.arb.IsPositionInvariant) {
         _mesa_insert_mvp_code(ctx, &newVP->program);
      }
      newVP->id = get_new_program_id(brw->screen);

      /* Also tell tnl about it:
       */
      _tnl_program_string(ctx, target, prog);

      prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                 compiler->scalar_stage[MESA_SHADER_VERTEX]);

      brw_nir_lower_resources(prog->nir, NULL, prog, &brw->screen->devinfo);

      brw_shader_gather_info(prog->nir, prog);

      brw_vs_precompile(ctx, prog);
      break;
   }
   default:
      /*
       * driver->ProgramStringNotify is only called for ARB programs, fixed
       * function vertex programs, and ir_to_mesa (which isn't used by the
       * i965 back-end).  Therefore, even after geometry shaders are added,
       * this function should only ever be called with a target of
       * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
       */
      unreachable("Unexpected target in brwProgramStringNotify");
   }

   return true;
}

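/* Implement glMemoryBarrier() by translating the GL barrier bits into the
 * PIPE_CONTROL flush/invalidate bits that make prior shader writes visible
 * to the requested consumers.
 */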
static void
brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
   assert(devinfo->gen >= 7 && devinfo->gen <= 11);

   if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
                   GL_ELEMENT_ARRAY_BARRIER_BIT |
                   GL_COMMAND_BARRIER_BIT))
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;

   if (barriers & GL_UNIFORM_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;

   if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
                   GL_PIXEL_BUFFER_BARRIER_BIT))
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
      bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
               PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Typed surface messages are handled by the render cache on IVB, so we
    * need to flush it too.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell)
      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;

   brw_emit_pipe_control_flush(brw, bits);
}

static void
brw_framebuffer_fetch_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
      if (devinfo->gen >= 6) {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
      } else {
         brw_emit_pipe_control_flush(brw,
                                     PIPE_CONTROL_RENDER_TARGET_FLUSH);
      }
   }
}

void
brw_get_scratch_bo(struct brw_context *brw,
                   struct brw_bo **scratch_bo, int size)
{
   struct brw_bo *old_bo = *scratch_bo;

   if (old_bo && old_bo->size < size) {
      brw_bo_unreference(old_bo);
      old_bo = NULL;
   }

   if (!old_bo) {
      *scratch_bo =
         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
   }
}

/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes times the maximum number of threads the stage can have in flight.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch (stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       *    "Scratch Space per slice is computed based on 4 sub-slices. SW must
       *     allocate scratch space enough so that each slice has 4 slices
       *     allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices, but
       * for scratch sizing we pretend that each slice has 4 subslices,
       * regardless of the actual number of subslices per slice.
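       *
       * For example, a part with 2 slices is treated here as having
       * 4 * 2 = 8 subslices, whatever subslice_total actually reports.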
       */
      if (devinfo->gen >= 9 && devinfo->gen < 11)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed.  The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is.  There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit.  Even though there are only 10
          * EUs per subslice, this is stored in 4 bits, so there's an
          * effective maximum value of 16 EUs.  Similarly, although there are
          * only 7 threads per EU, this is stored in a 3 bit number, giving
          * an effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads.  The 6 EU devices appear to calculate thread
          * IDs as if they had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}

void brwInitFragProgFuncs( struct dd_function_table *functions )
{
   assert(functions->ProgramStringNotify == _tnl_program_string);

   functions->NewProgram = brwNewProgram;
   functions->DeleteProgram = brwDeleteProgram;
   functions->ProgramStringNotify = brwProgramStringNotify;

   functions->LinkShader = brw_link_shader;

   functions->MemoryBarrier = brw_memory_barrier;
   functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
}

struct shader_times {
   uint64_t time;
   uint64_t written;
   uint64_t reset;
};

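/* Set up the buffer and bookkeeping for INTEL_DEBUG=shader_time.  Each of
 * the max_entries records occupies three BRW_SHADER_TIME_STRIDE-sized slots
 * in the BO (time, written, reset), which brw_collect_shader_time() reads
 * back and accumulates into shader_time.cumulative.
 */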
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 2048;
   brw->shader_time.bo =
      brw_bo_alloc(brw->bufmgr, "shader time",
                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
                   BRW_MEMZONE_OTHER);
   brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
   brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                          max_entries);
   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                               max_entries);
   brw->shader_time.max_entries = max_entries;
}

static int
compare_time(const void *a, const void *b)
{
   uint64_t * const *a_val = a;
   uint64_t * const *b_val = b;

   /* We don't just subtract because we're turning the value into an int. */
   if (**a_val < **b_val)
      return -1;
   else if (**a_val == **b_val)
      return 0;
   else
      return 1;
}

static void
print_shader_time_line(const char *stage, const char *name,
                       int shader_num, uint64_t time, uint64_t total)
{
   fprintf(stderr, "%-6s%-18s", stage, name);

   if (shader_num != 0)
      fprintf(stderr, "%4d: ", shader_num);
   else
      fprintf(stderr, "    : ");

   fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
           (long long)time,
           (double)time / 1000000000.0,
           (double)time / total * 100.0);
}

static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointer to the index of the time to print. */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}

static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering.  We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;

      brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
      brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
      brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
   }

   /* Zero the BO to clear it out for our next collection. */
   memset(bo_map, 0, brw->shader_time.bo->size);
   brw_bo_unmap(brw->shader_time.bo);
}

void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   if (brw->shader_time.report_time == 0 ||
       get_time() - brw->shader_time.report_time >= 1.0) {
      brw_report_shader_time(brw);
      brw->shader_time.report_time = get_time();
   }
}

/**
 * Chooses an index in the shader_time buffer and sets up tracking information
 * for our printouts.
 *
 * Note that this holds on to references to the underlying programs, which may
 * change their lifetimes compared to normal operation.
 */
int
brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
                          enum shader_time_shader_type type, bool is_glsl_sh)
{
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;

   const char *name;
   if (prog->Id == 0) {
      name = "ff";
   } else if (is_glsl_sh) {
      name = prog->info.label ?
         ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
   } else {
      name = "prog";
   }

   brw->shader_time.names[shader_time_index] = name;
   brw->shader_time.ids[shader_time_index] = prog->Id;

   return shader_time_index;
}

void
brw_destroy_shader_time(struct brw_context *brw)
{
   brw_bo_unreference(brw->shader_time.bo);
   brw->shader_time.bo = NULL;
}

void
brw_stage_prog_data_free(const void *p)
{
   struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;

   ralloc_free(prog_data->param);
   ralloc_free(prog_data->pull_param);
}

void
brw_dump_arb_asm(const char *stage, struct gl_program *prog)
{
   fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
           stage, prog->Id, stage);
   _mesa_print_program(prog);
}

void
brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                             struct brw_sampler_prog_key_data *tex,
                             const struct gl_program *prog)
{
   const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
   unsigned sampler_count = util_last_bit(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}

/**
 * Sets up the starting offsets for the groups of binding table entries
 * common to all pipeline stages.
 *
 * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 * unused, and also to ensure that adding small offsets to them will still
 * trigger our asserts that surface indices are < BRW_MAX_SURFACES.
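 *
 * The resulting layout is: textures, UBOs, ABOs+SSBOs, the optional
 * shader-time slot, gather textures (a separate copy is needed only on
 * pre-Gen8), images, one pull constant slot, and two more copies of the
 * texture section for the extra planes of multi-planar (YUV) textures.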
 */
uint32_t
brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
                                        const struct gl_program *prog,
                                        struct brw_stage_prog_data *stage_prog_data,
                                        uint32_t next_binding_table_offset)
{
   int num_textures = util_last_bit(prog->SamplersUsed);

   stage_prog_data->binding_table.texture_start = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   if (prog->info.num_ubos) {
      assert(prog->info.num_ubos <= BRW_MAX_UBO);
      stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_ubos;
   } else {
      stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
   }

   if (prog->info.num_ssbos || prog->info.num_abos) {
      assert(prog->info.num_abos <= BRW_MAX_ABO);
      assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
      stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
   } else {
      stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
   }

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
      next_binding_table_offset++;
   } else {
      stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
   }

   if (prog->info.uses_texture_gather) {
      if (devinfo->gen >= 8) {
         stage_prog_data->binding_table.gather_texture_start =
            stage_prog_data->binding_table.texture_start;
      } else {
         stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
         next_binding_table_offset += num_textures;
      }
   } else {
      stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
   }

   if (prog->info.num_images) {
      stage_prog_data->binding_table.image_start = next_binding_table_offset;
      next_binding_table_offset += prog->info.num_images;
   } else {
      stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
   }

   /* This may or may not be used depending on how the compile goes. */
   stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
   next_binding_table_offset++;

   /* Plane 0 is just the regular texture section */
   stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;

   stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
   next_binding_table_offset += num_textures;

   /* Set the binding table size.  Some callers may append new entries
    * and increase this accordingly.
    */
   stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;

   assert(next_binding_table_offset <= BRW_MAX_SURFACES);
   return next_binding_table_offset;
}

void
brw_populate_default_key(const struct brw_compiler *compiler,
                         union brw_any_prog_key *prog_key,
                         struct gl_shader_program *sh_prog,
                         struct gl_program *prog)
{
   switch (prog->info.stage) {
   case MESA_SHADER_VERTEX:
      brw_vs_populate_default_key(compiler, &prog_key->vs, prog);
      break;
   case MESA_SHADER_TESS_CTRL:
      brw_tcs_populate_default_key(compiler, &prog_key->tcs, sh_prog, prog);
      break;
   case MESA_SHADER_TESS_EVAL:
      brw_tes_populate_default_key(compiler, &prog_key->tes, sh_prog, prog);
      break;
   case MESA_SHADER_GEOMETRY:
      brw_gs_populate_default_key(compiler, &prog_key->gs, prog);
      break;
   case MESA_SHADER_FRAGMENT:
      brw_wm_populate_default_key(compiler, &prog_key->wm, prog);
      break;
   case MESA_SHADER_COMPUTE:
      brw_cs_populate_default_key(compiler, &prog_key->cs, prog);
      break;
   default:
      unreachable("Unsupported stage!");
   }
}

void
brw_debug_recompile(struct brw_context *brw,
                    gl_shader_stage stage,
                    unsigned api_id,
                    struct brw_base_prog_key *key)
{
   const struct brw_compiler *compiler = brw->screen->compiler;
   enum brw_cache_id cache_id = brw_stage_cache_id(stage);

   compiler->shader_perf_log(brw, "Recompiling %s shader for program %d\n",
                             _mesa_shader_stage_to_string(stage), api_id);

   const void *old_key =
      brw_find_previous_compile(&brw->cache, cache_id, key->program_string_id);

   brw_debug_key_recompile(compiler, brw, stage, old_key, key);