src/mesa/drivers/dri/i965/brw_program.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keithw@vmware.com>
  30   */
  31
  32 #include <pthread.h>
  33 #include "main/imports.h"
  34 #include "main/glspirv.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "program/prog_to_nir.h"
  38 #include "program/program.h"
  39 #include "program/programopt.h"
  40 #include "tnl/tnl.h"
  41 #include "util/ralloc.h"
  42 #include "compiler/glsl/ir.h"
  43 #include "compiler/glsl/program.h"
  44 #include "compiler/glsl/glsl_to_nir.h"
  45 #include "glsl/float64_glsl.h"
  46
  47 #include "brw_program.h"
  48 #include "brw_context.h"
  49 #include "compiler/brw_nir.h"
  50 #include "brw_defines.h"
  51 #include "intel_batchbuffer.h"
  52
  53 #include "brw_cs.h"
  54 #include "brw_gs.h"
  55 #include "brw_vs.h"
  56 #include "brw_wm.h"
  57
  58 #include "main/shaderapi.h"
  59 #include "main/shaderobj.h"
  60
  61 static bool
  62 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
  63 {
  64    if (is_scalar) {
  65       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  66                                type_size_scalar_bytes);
  67       return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
  68    } else {
  69       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  70                                type_size_vec4_bytes);
  71       return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
  72    }
  73 }
  74
/* Forward declaration: compile_fp64_funcs() below calls brwNewProgram()
 * before its definition appears later in this file.
 */
static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
                                        GLuint id, bool is_arb_asm);
  77
  78 static nir_shader *
  79 compile_fp64_funcs(struct gl_context *ctx,
  80                    const nir_shader_compiler_options *options,
  81                    void *mem_ctx,
  82                    gl_shader_stage stage)
  83 {
  84    const GLuint name = ~0;
  85    struct gl_shader *sh;
  86
  87    sh = _mesa_new_shader(name, stage);
  88
  89    sh->Source = float64_source;
  90    sh->CompileStatus = COMPILE_FAILURE;
  91    _mesa_glsl_compile_shader(ctx, sh, false, false, true);
  92
  93    if (!sh->CompileStatus) {
  94       if (sh->InfoLog) {
  95          _mesa_problem(ctx,
  96                        "fp64 software impl compile failed:\n%s\nsource:\n%s\n",
  97                        sh->InfoLog, float64_source);
  98       }
  99    }
 100
 101    struct gl_shader_program *sh_prog;
 102    sh_prog = _mesa_new_shader_program(name);
 103    sh_prog->Label = NULL;
 104    sh_prog->NumShaders = 1;
 105    sh_prog->Shaders = malloc(sizeof(struct gl_shader *));
 106    sh_prog->Shaders[0] = sh;
 107
 108    struct gl_linked_shader *linked = rzalloc(NULL, struct gl_linked_shader);
 109    linked->Stage = stage;
 110    linked->Program =
 111       brwNewProgram(ctx,
 112                     _mesa_shader_stage_to_program(stage),
 113                     name, false);
 114
 115    linked->ir = sh->ir;
 116    sh_prog->_LinkedShaders[stage] = linked;
 117
 118    nir_shader *nir = glsl_to_nir(sh_prog, stage, options);
 119
 120    return nir_shader_clone(mem_ctx, nir);
 121 }
 122
/**
 * Translate one shader stage to NIR and apply the driver's front-end
 * lowering passes to it.
 *
 * \param brw          driver context; supplies devinfo, the compiler and the
 *                     per-stage NIR compiler options.
 * \param shader_prog  linked GLSL/SPIR-V program, or NULL, in which case
 *                     \p prog is translated with prog_to_nir() instead.
 * \param prog         the gl_program for this stage.
 * \param stage        shader stage being translated.
 * \param is_scalar    forwarded to brw_nir_lower_uniforms() to choose scalar
 *                     or vec4 uniform sizing.
 *
 * \return the lowered nir_shader.
 */
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(shader_prog, stage, options);
      }
      assert (nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_lower_returns(nir);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir and "
                               "return lowering");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      /* No linked program: translate the gl_program directly. */
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* When the device lacks 64-bit types but the shader uses them, compile the
    * fp64 helper functions and append them to this shader's function list.
    */
   if (!devinfo->has_64bit_types && nir->info.uses_64bit) {
      nir_shader *fp64 = compile_fp64_funcs(ctx, options, ralloc_parent(nir), stage);

      nir_validate_shader(fp64, "fp64");
      exec_list_append(&nir->functions, &fp64->functions);
   }

   nir = brw_preprocess_nir(brw->screen->compiler, nir);

   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       *
       * NOTE: shader_prog is dereferenced unconditionally here, so this path
       * requires a non-NULL shader_prog.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      /* Only add the state reference to prog->Parameters when the pass
       * actually changed the shader.
       */
      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}
 209
 210 void
 211 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
 212 {
 213    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 214
 215    /* Copy the info we just generated back into the gl_program */
 216    const char *prog_name = prog->info.name;
 217    const char *prog_label = prog->info.label;
 218    prog->info = nir->info;
 219    prog->info.name = prog_name;
 220    prog->info.label = prog_label;
 221 }
 222
 223 static unsigned
 224 get_new_program_id(struct intel_screen *screen)
 225 {
 226    return p_atomic_inc_return(&screen->program_id);
 227 }
 228
 229 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
 230                                         GLuint id, bool is_arb_asm)
 231 {
 232    struct brw_context *brw = brw_context(ctx);
 233    struct brw_program *prog = rzalloc(NULL, struct brw_program);
 234
 235    if (prog) {
 236       prog->id = get_new_program_id(brw->screen);
 237
 238       return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
 239    }
 240
 241    return NULL;
 242 }
 243
 244 static void brwDeleteProgram( struct gl_context *ctx,
 245                               struct gl_program *prog )
 246 {
 247    struct brw_context *brw = brw_context(ctx);
 248
 249    /* Beware!  prog's refcount has reached zero, and it's about to be freed.
 250     *
 251     * In brw_upload_pipeline_state(), we compare brw->programs[i] to
 252     * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
 253     * pointer has changed.
 254     *
 255     * We cannot leave brw->programs[i] as a dangling pointer to the dead
 256     * program.  malloc() may allocate the same memory for a new gl_program,
 257     * causing us to see matching pointers...but totally different programs.
 258     *
 259     * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
 260     * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
 261     * would cause us to see matching pointers (NULL == NULL), and fail to
 262     * detect that a program has changed since our last draw.
 263     *
 264     * So, set it to a bogus gl_program pointer that will never match,
 265     * causing us to properly reevaluate the state on our next draw.
 266     *
 267     * Getting this wrong causes heisenbugs which are very hard to catch,
 268     * as you need a very specific allocation pattern to hit the problem.
 269     */
 270    static const struct gl_program deleted_program;
 271
 272    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
 273       if (brw->programs[i] == prog)
 274          brw->programs[i] = (struct gl_program *) &deleted_program;
 275    }
 276
 277    _mesa_delete_program( ctx, prog );
 278 }
 279
 280
 281 static GLboolean
 282 brwProgramStringNotify(struct gl_context *ctx,
 283                        GLenum target,
 284                        struct gl_program *prog)
 285 {
 286    assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
 287
 288    struct brw_context *brw = brw_context(ctx);
 289    const struct brw_compiler *compiler = brw->screen->compiler;
 290
 291    switch (target) {
 292    case GL_FRAGMENT_PROGRAM_ARB: {
 293       struct brw_program *newFP = brw_program(prog);
 294       const struct brw_program *curFP =
 295          brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
 296
 297       if (newFP == curFP)
 298          brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
 299       newFP->id = get_new_program_id(brw->screen);
 300
 301       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 302
 303       brw_shader_gather_info(prog->nir, prog);
 304
 305       brw_fs_precompile(ctx, prog);
 306       break;
 307    }
 308    case GL_VERTEX_PROGRAM_ARB: {
 309       struct brw_program *newVP = brw_program(prog);
 310       const struct brw_program *curVP =
 311          brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
 312
 313       if (newVP == curVP)
 314          brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
 315       if (newVP->program.arb.IsPositionInvariant) {
 316          _mesa_insert_mvp_code(ctx, &newVP->program);
 317       }
 318       newVP->id = get_new_program_id(brw->screen);
 319
 320       /* Also tell tnl about it:
 321        */
 322       _tnl_program_string(ctx, target, prog);
 323
 324       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
 325                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 326
 327       brw_shader_gather_info(prog->nir, prog);
 328
 329       brw_vs_precompile(ctx, prog);
 330       break;
 331    }
 332    default:
 333       /*
 334        * driver->ProgramStringNotify is only called for ARB programs, fixed
 335        * function vertex programs, and ir_to_mesa (which isn't used by the
 336        * i965 back-end).  Therefore, even after geometry shaders are added,
 337        * this function should only ever be called with a target of
 338        * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
 339        */
 340       unreachable("Unexpected target in brwProgramStringNotify");
 341    }
 342
 343    return true;
 344 }
 345
 346 static void
 347 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 348 {
 349    struct brw_context *brw = brw_context(ctx);
 350    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 351    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
 352    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 353
 354    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
 355                    GL_ELEMENT_ARRAY_BARRIER_BIT |
 356                    GL_COMMAND_BARRIER_BIT))
 357       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 358
 359    if (barriers & GL_UNIFORM_BARRIER_BIT)
 360       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 361                PIPE_CONTROL_CONST_CACHE_INVALIDATE);
 362
 363    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
 364       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 365
 366    if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
 367                    GL_PIXEL_BUFFER_BARRIER_BIT))
 368       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 369                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 370
 371    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
 372       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 373                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 374
 375    /* Typed surface messages are handled by the render cache on IVB, so we
 376     * need to flush it too.
 377     */
 378    if (devinfo->gen == 7 && !devinfo->is_haswell)
 379       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
 380
 381    brw_emit_pipe_control_flush(brw, bits);
 382 }
 383
 384 static void
 385 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 386 {
 387    struct brw_context *brw = brw_context(ctx);
 388    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 389
 390    if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
 391       if (devinfo->gen >= 6) {
 392          brw_emit_pipe_control_flush(brw,
 393                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
 394                                      PIPE_CONTROL_CS_STALL);
 395          brw_emit_pipe_control_flush(brw,
 396                                      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 397       } else {
 398          brw_emit_pipe_control_flush(brw,
 399                                      PIPE_CONTROL_RENDER_TARGET_FLUSH);
 400       }
 401    }
 402 }
 403
 404 void
 405 brw_get_scratch_bo(struct brw_context *brw,
 406                    struct brw_bo **scratch_bo, int size)
 407 {
 408    struct brw_bo *old_bo = *scratch_bo;
 409
 410    if (old_bo && old_bo->size < size) {
 411       brw_bo_unreference(old_bo);
 412       old_bo = NULL;
 413    }
 414
 415    if (!old_bo) {
 416       *scratch_bo =
 417          brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
 418    }
 419 }
 420
/**
 * Reserve enough scratch space for the given stage to hold \p per_thread_size
 * bytes for every thread the stage can run.
 *
 * The thread count is not a parameter: it is derived below from devinfo
 * according to stage_state->stage.  Nothing is done if the stage already has
 * at least \p per_thread_size bytes per thread; otherwise the old scratch BO
 * (if any) is unreferenced and a new one of per_thread_size * thread_count
 * bytes is allocated.
 */
void
brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
                        unsigned per_thread_size)
{
   /* The per-thread size only ever grows. */
   if (stage_state->per_thread_scratch >= per_thread_size)
      return;

   stage_state->per_thread_scratch = per_thread_size;

   if (stage_state->scratch_bo)
      brw_bo_unreference(stage_state->scratch_bo);

   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   unsigned thread_count;
   switch(stage_state->stage) {
   case MESA_SHADER_VERTEX:
      thread_count = devinfo->max_vs_threads;
      break;
   case MESA_SHADER_TESS_CTRL:
      thread_count = devinfo->max_tcs_threads;
      break;
   case MESA_SHADER_TESS_EVAL:
      thread_count = devinfo->max_tes_threads;
      break;
   case MESA_SHADER_GEOMETRY:
      thread_count = devinfo->max_gs_threads;
      break;
   case MESA_SHADER_FRAGMENT:
      thread_count = devinfo->max_wm_threads;
      break;
   case MESA_SHADER_COMPUTE: {
      /* Treat a reported subslice total of 0 as 1. */
      unsigned subslices = MAX2(brw->screen->subslice_total, 1);

      /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
       *
       * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
       *  allocate scratch space enough so that each slice has 4 slices
       *  allowed."
       *
       * According to the other driver team, this applies to compute shaders
       * as well.  This is not currently documented at all.
       *
       * brw->screen->subslice_total is the TOTAL number of subslices
       * and we wish to view that there are 4 subslices per slice
       * instead of the actual number of subslices per slice.
       */
      if (devinfo->gen >= 9 && devinfo->gen < 11)
         subslices = 4 * brw->screen->devinfo.num_slices;

      unsigned scratch_ids_per_subslice;
      if (devinfo->is_haswell) {
         /* WaCSScratchSize:hsw
          *
          * Haswell's scratch space address calculation appears to be sparse
          * rather than tightly packed. The Thread ID has bits indicating
          * which subslice, EU within a subslice, and thread within an EU it
          * is. There's a maximum of two slices and two subslices, so these
          * can be stored with a single bit. Even though there are only 10 EUs
          * per subslice, this is stored in 4 bits, so there's an effective
          * maximum value of 16 EUs. Similarly, although there are only 7
          * threads per EU, this is stored in a 3 bit number, giving an
          * effective maximum value of 8 threads per EU.
          *
          * This means that we need to use 16 * 8 instead of 10 * 7 for the
          * number of threads per subslice.
          */
         scratch_ids_per_subslice = 16 * 8;
      } else if (devinfo->is_cherryview) {
         /* Cherryview devices have either 6 or 8 EUs per subslice, and each
          * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
          * as if it had 8 EUs.
          */
         scratch_ids_per_subslice = 8 * 7;
      } else {
         scratch_ids_per_subslice = devinfo->max_cs_threads;
      }

      thread_count = scratch_ids_per_subslice * subslices;
      break;
   }
   default:
      unreachable("Unsupported stage!");
   }

   /* NOTE(review): the result of brw_bo_alloc() is stored unchecked. */
   stage_state->scratch_bo =
      brw_bo_alloc(brw->bufmgr, "shader scratch space",
                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
}
 514
 515 void brwInitFragProgFuncs( struct dd_function_table *functions )
 516 {
 517    assert(functions->ProgramStringNotify == _tnl_program_string);
 518
 519    functions->NewProgram = brwNewProgram;
 520    functions->DeleteProgram = brwDeleteProgram;
 521    functions->ProgramStringNotify = brwProgramStringNotify;
 522
 523    functions->LinkShader = brw_link_shader;
 524
 525    functions->MemoryBarrier = brw_memory_barrier;
 526    functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 527 }
 528
/**
 * Running 64-bit totals for one shader_time entry, accumulated on the CPU
 * from the three 32-bit counters read back per entry in
 * brw_collect_shader_time().
 */
struct shader_times {
   uint64_t time;    /* sum of the counter at BRW_SHADER_TIME_STRIDE * 0 */
   uint64_t written; /* sum of the counter at BRW_SHADER_TIME_STRIDE * 1 */
   uint64_t reset;   /* sum of the counter at BRW_SHADER_TIME_STRIDE * 2 */
};
 534
 535 void
 536 brw_init_shader_time(struct brw_context *brw)
 537 {
 538    const int max_entries = 2048;
 539    brw->shader_time.bo =
 540       brw_bo_alloc(brw->bufmgr, "shader time",
 541                    max_entries * BRW_SHADER_TIME_STRIDE * 3,
 542                    BRW_MEMZONE_OTHER);
 543    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
 544    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
 545    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
 546                                           max_entries);
 547    brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
 548                                                max_entries);
 549    brw->shader_time.max_entries = max_entries;
 550 }
 551
 552 static int
 553 compare_time(const void *a, const void *b)
 554 {
 555    uint64_t * const *a_val = a;
 556    uint64_t * const *b_val = b;
 557
 558    /* We don't just subtract because we're turning the value to an int. */
 559    if (**a_val < **b_val)
 560       return -1;
 561    else if (**a_val == **b_val)
 562       return 0;
 563    else
 564       return 1;
 565 }
 566
 567 static void
 568 print_shader_time_line(const char *stage, const char *name,
 569                        int shader_num, uint64_t time, uint64_t total)
 570 {
 571    fprintf(stderr, "%-6s%-18s", stage, name);
 572
 573    if (shader_num != 0)
 574       fprintf(stderr, "%4d: ", shader_num);
 575    else
 576       fprintf(stderr, "    : ");
 577
 578    fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
 579            (long long)time,
 580            (double)time / 1000000000.0,
 581            (double)time / total * 100.0);
 582 }
 583
/**
 * Print the accumulated shader-time counters to stderr: one line per entry
 * with a non-zero scaled time, in ascending order of time, followed by
 * per-shader-type totals.
 *
 * Does nothing if there is no shader_time BO or no entries have been
 * registered, and prints a short notice instead of the table when the grand
 * total is zero.
 */
static void
brw_report_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
      return;

   /* scaled[] holds the per-entry value to print; sorted[] holds pointers
    * into scaled[] so the entries can be ordered without losing their index.
    */
   uint64_t scaled[brw->shader_time.num_entries];
   uint64_t *sorted[brw->shader_time.num_entries];
   /* NOTE(review): sized on the assumption that ST_CS is the largest type
    * value used as an index below — confirm against the enum definition.
    */
   uint64_t total_by_type[ST_CS + 1];
   memset(total_by_type, 0, sizeof(total_by_type));
   double total = 0;
   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      uint64_t written = 0, reset = 0;
      enum shader_time_shader_type type = brw->shader_time.types[i];

      sorted[i] = &scaled[i];

      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         written = brw->shader_time.cumulative[i].written;
         reset = brw->shader_time.cumulative[i].reset;
         break;

      default:
         /* I sometimes want to print things that aren't the 3 shader times.
          * Just print the sum in that case.
          */
         written = 1;
         reset = 0;
         break;
      }

      /* Scale the time by (written + reset) / written.  The integer division
       * is performed first, so the quotient is truncated before the multiply.
       */
      uint64_t time = brw->shader_time.cumulative[i].time;
      if (written) {
         scaled[i] = time / written * (written + reset);
      } else {
         scaled[i] = time;
      }

      /* Only the known shader types contribute to the per-type totals. */
      switch (type) {
      case ST_VS:
      case ST_TCS:
      case ST_TES:
      case ST_GS:
      case ST_FS8:
      case ST_FS16:
      case ST_FS32:
      case ST_CS:
         total_by_type[type] += scaled[i];
         break;
      default:
         break;
      }

      total += scaled[i];
   }

   if (total == 0) {
      fprintf(stderr, "No shader time collected yet\n");
      return;
   }

   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);

   fprintf(stderr, "\n");
   fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
   for (int s = 0; s < brw->shader_time.num_entries; s++) {
      const char *stage;
      /* Work back from the sorted pointers times to a time to print. */
      int i = sorted[s] - scaled;

      if (scaled[i] == 0)
         continue;

      int shader_num = brw->shader_time.ids[i];
      const char *shader_name = brw->shader_time.names[i];

      switch (brw->shader_time.types[i]) {
      case ST_VS:
         stage = "vs";
         break;
      case ST_TCS:
         stage = "tcs";
         break;
      case ST_TES:
         stage = "tes";
         break;
      case ST_GS:
         stage = "gs";
         break;
      case ST_FS8:
         stage = "fs8";
         break;
      case ST_FS16:
         stage = "fs16";
         break;
      case ST_FS32:
         stage = "fs32";
         break;
      case ST_CS:
         stage = "cs";
         break;
      default:
         stage = "other";
         break;
      }

      print_shader_time_line(stage, shader_name, shader_num,
                             scaled[i], total);
   }

   /* Summary rows: one per shader type, with the id column left blank. */
   fprintf(stderr, "\n");
   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
   print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
   print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
}
 712
 713 static void
 714 brw_collect_shader_time(struct brw_context *brw)
 715 {
 716    if (!brw->shader_time.bo)
 717       return;
 718
 719    /* This probably stalls on the last rendering.  We could fix that by
 720     * delaying reading the reports, but it doesn't look like it's a big
 721     * overhead compared to the cost of tracking the time in the first place.
 722     */
 723    void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 724
 725    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 726       uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
 727
 728       brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
 729       brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
 730       brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
 731    }
 732
 733    /* Zero the BO out to clear it out for our next collection.
 734     */
 735    memset(bo_map, 0, brw->shader_time.bo->size);
 736    brw_bo_unmap(brw->shader_time.bo);
 737 }
 738
 739 void
 740 brw_collect_and_report_shader_time(struct brw_context *brw)
 741 {
 742    brw_collect_shader_time(brw);
 743
 744    if (brw->shader_time.report_time == 0 ||
 745        get_time() - brw->shader_time.report_time >= 1.0) {
 746       brw_report_shader_time(brw);
 747       brw->shader_time.report_time = get_time();
 748    }
 749 }
 750
 751 /**
 752  * Chooses an index in the shader_time buffer and sets up tracking information
 753  * for our printouts.
 754  *
 755  * Note that this holds on to references to the underlying programs, which may
 756  * change their lifetimes compared to normal operation.
 757  */
 758 int
 759 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
 760                           enum shader_time_shader_type type, bool is_glsl_sh)
 761 {
 762    int shader_time_index = brw->shader_time.num_entries++;
 763    assert(shader_time_index < brw->shader_time.max_entries);
 764    brw->shader_time.types[shader_time_index] = type;
 765
 766    const char *name;
 767    if (prog->Id == 0) {
 768       name = "ff";
 769    } else if (is_glsl_sh) {
 770       name = prog->info.label ?
 771          ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
 772    } else {
 773       name = "prog";
 774    }
 775
 776    brw->shader_time.names[shader_time_index] = name;
 777    brw->shader_time.ids[shader_time_index] = prog->Id;
 778
 779    return shader_time_index;
 780 }
 781
 782 void
 783 brw_destroy_shader_time(struct brw_context *brw)
 784 {
 785    brw_bo_unreference(brw->shader_time.bo);
 786    brw->shader_time.bo = NULL;
 787 }
 788
 789 void
 790 brw_stage_prog_data_free(const void *p)
 791 {
 792    struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
 793
 794    ralloc_free(prog_data->param);
 795    ralloc_free(prog_data->pull_param);
 796 }
 797
 798 void
 799 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 800 {
 801    fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
 802            stage, prog->Id, stage);
 803    _mesa_print_program(prog);
 804 }
 805
 806 void
 807 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
 808                              struct brw_sampler_prog_key_data *tex,
 809                              struct gl_program *prog)
 810 {
 811    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
 812    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
 813    for (unsigned i = 0; i < sampler_count; i++) {
 814       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
 815          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
 816          tex->swizzles[i] =
 817             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
 818       } else {
 819          /* Color sampler: assume no swizzling. */
 820          tex->swizzles[i] = SWIZZLE_XYZW;
 821       }
 822    }
 823 }
 824
 825 /**
 826  * Sets up the starting offsets for the groups of binding table entries
 827  * common to all pipeline stages.
 828  *
 829  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 830  * unused but also make sure that addition of small offsets to them will
 831  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 832  */
 833 uint32_t
 834 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
 835                                         const struct gl_program *prog,
 836                                         struct brw_stage_prog_data *stage_prog_data,
 837                                         uint32_t next_binding_table_offset)
 838 {
 839    int num_textures = util_last_bit(prog->SamplersUsed);
 840
 841    stage_prog_data->binding_table.texture_start = next_binding_table_offset;
 842    next_binding_table_offset += num_textures;
 843
 844    if (prog->info.num_ubos) {
 845       assert(prog->info.num_ubos <= BRW_MAX_UBO);
 846       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
 847       next_binding_table_offset += prog->info.num_ubos;
 848    } else {
 849       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
 850    }
 851
 852    if (prog->info.num_ssbos || prog->info.num_abos) {
 853       assert(prog->info.num_abos <= BRW_MAX_ABO);
 854       assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
 855       stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
 856       next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
 857    } else {
 858       stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
 859    }
 860
 861    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
 862       stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
 863       next_binding_table_offset++;
 864    } else {
 865       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
 866    }
 867
 868    if (prog->info.uses_texture_gather) {
 869       if (devinfo->gen >= 8) {
 870          stage_prog_data->binding_table.gather_texture_start =
 871             stage_prog_data->binding_table.texture_start;
 872       } else {
 873          stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
 874          next_binding_table_offset += num_textures;
 875       }
 876    } else {
 877       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
 878    }
 879
 880    if (prog->info.num_images) {
 881       stage_prog_data->binding_table.image_start = next_binding_table_offset;
 882       next_binding_table_offset += prog->info.num_images;
 883    } else {
 884       stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
 885    }
 886
 887    /* This may or may not be used depending on how the compile goes. */
 888    stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
 889    next_binding_table_offset++;
 890
 891    /* Plane 0 is just the regular texture section */
 892    stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
 893
 894    stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
 895    next_binding_table_offset += num_textures;
 896
 897    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
 898    next_binding_table_offset += num_textures;
 899
 900    /* Set the binding table size.  Some callers may append new entries
 901     * and increase this accordingly.
 902     */
 903    stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
 904
 905    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
 906    return next_binding_table_offset;
 907 }
 908
 909 void
 910 brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
 911                     unsigned id)
 912 {
 913    static const unsigned stage_offsets[] = {
 914       offsetof(struct brw_vs_prog_key, program_string_id),
 915       offsetof(struct brw_tcs_prog_key, program_string_id),
 916       offsetof(struct brw_tes_prog_key, program_string_id),
 917       offsetof(struct brw_gs_prog_key, program_string_id),
 918       offsetof(struct brw_wm_prog_key, program_string_id),
 919       offsetof(struct brw_cs_prog_key, program_string_id),
 920    };
 921    assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
 922    *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
 923 }
 924
 925 void
 926 brw_populate_default_key(const struct gen_device_info *devinfo,
 927                          union brw_any_prog_key *prog_key,
 928                          struct gl_shader_program *sh_prog,
 929                          struct gl_program *prog)
 930 {
 931    switch (prog->info.stage) {
 932    case MESA_SHADER_VERTEX:
 933       brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
 934       break;
 935    case MESA_SHADER_TESS_CTRL:
 936       brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
 937       break;
 938    case MESA_SHADER_TESS_EVAL:
 939       brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
 940       break;
 941    case MESA_SHADER_GEOMETRY:
 942       brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
 943       break;
 944    case MESA_SHADER_FRAGMENT:
 945       brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
 946       break;
 947    case MESA_SHADER_COMPUTE:
 948       brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
 949       break;
 950    default:
 951       unreachable("Unsupported stage!");
 952    }
 953 }