src/mesa/drivers/dri/i965/brw_program.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keithw@vmware.com>
  30   */
  31
  32 #include <pthread.h>
  33 #include "main/imports.h"
  34 #include "main/glspirv.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "program/prog_to_nir.h"
  38 #include "program/program.h"
  39 #include "program/programopt.h"
  40 #include "tnl/tnl.h"
  41 #include "util/ralloc.h"
  42 #include "compiler/glsl/ir.h"
  43 #include "compiler/glsl/program.h"
  44 #include "compiler/glsl/gl_nir.h"
  45 #include "compiler/glsl/glsl_to_nir.h"
  46 #include "glsl/float64_glsl.h"
  47
  48 #include "brw_program.h"
  49 #include "brw_context.h"
  50 #include "compiler/brw_nir.h"
  51 #include "brw_defines.h"
  52 #include "intel_batchbuffer.h"
  53
  54 #include "brw_cs.h"
  55 #include "brw_gs.h"
  56 #include "brw_vs.h"
  57 #include "brw_wm.h"
  58
  59 #include "main/shaderapi.h"
  60 #include "main/shaderobj.h"
  61
  62 static bool
  63 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
  64 {
  65    if (is_scalar) {
  66       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  67                                type_size_scalar_bytes);
  68       return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
  69    } else {
  70       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  71                                type_size_vec4_bytes);
  72       return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
  73    }
  74 }
  75
  76 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
  77                                         GLuint id, bool is_arb_asm);
  78
  79 static nir_shader *
  80 compile_fp64_funcs(struct gl_context *ctx,
  81                    const nir_shader_compiler_options *options,
  82                    void *mem_ctx,
  83                    gl_shader_stage stage)
  84 {
  85    const GLuint name = ~0;
  86    struct gl_shader *sh;
  87
  88    sh = _mesa_new_shader(name, stage);
  89
  90    sh->Source = float64_source;
  91    sh->CompileStatus = COMPILE_FAILURE;
  92    _mesa_glsl_compile_shader(ctx, sh, false, false, true);
  93
  94    if (!sh->CompileStatus) {
  95       if (sh->InfoLog) {
  96          _mesa_problem(ctx,
  97                        "fp64 software impl compile failed:\n%s\nsource:\n%s\n",
  98                        sh->InfoLog, float64_source);
  99       }
 100    }
 101
 102    struct gl_shader_program *sh_prog;
 103    sh_prog = _mesa_new_shader_program(name);
 104    sh_prog->Label = NULL;
 105    sh_prog->NumShaders = 1;
 106    sh_prog->Shaders = malloc(sizeof(struct gl_shader *));
 107    sh_prog->Shaders[0] = sh;
 108
 109    struct gl_linked_shader *linked = rzalloc(NULL, struct gl_linked_shader);
 110    linked->Stage = stage;
 111    linked->Program =
 112       brwNewProgram(ctx,
 113                     _mesa_shader_stage_to_program(stage),
 114                     name, false);
 115
 116    linked->ir = sh->ir;
 117    sh_prog->_LinkedShaders[stage] = linked;
 118
 119    nir_shader *nir = glsl_to_nir(sh_prog, stage, options);
 120
 121    return nir_shader_clone(mem_ctx, nir);
 122 }
 123
 124 nir_shader *
 125 brw_create_nir(struct brw_context *brw,
 126                const struct gl_shader_program *shader_prog,
 127                struct gl_program *prog,
 128                gl_shader_stage stage,
 129                bool is_scalar)
 130 {
 131    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 132    struct gl_context *ctx = &brw->ctx;
 133    const nir_shader_compiler_options *options =
 134       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
 135    nir_shader *nir;
 136
 137    /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
 138    if (shader_prog) {
 139       if (shader_prog->data->spirv) {
 140          nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
 141       } else {
 142          nir = glsl_to_nir(shader_prog, stage, options);
 143       }
 144       assert (nir);
 145
 146       nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
 147       nir_lower_returns(nir);
 148       nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir and "
 149                                "return lowering");
 150       NIR_PASS_V(nir, nir_lower_io_to_temporaries,
 151                  nir_shader_get_entrypoint(nir), true, false);
 152    } else {
 153       nir = prog_to_nir(prog, options);
 154       NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
 155       NIR_PASS_V(nir, gl_nir_lower_samplers, NULL);
 156    }
 157    nir_validate_shader(nir, "before brw_preprocess_nir");
 158
 159    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 160
 161    if (!devinfo->has_64bit_types && nir->info.uses_64bit) {
 162       nir_shader *fp64 = compile_fp64_funcs(ctx, options, ralloc_parent(nir), stage);
 163
 164       nir_validate_shader(fp64, "fp64");
 165       exec_list_append(&nir->functions, &fp64->functions);
 166    }
 167
 168    nir = brw_preprocess_nir(brw->screen->compiler, nir);
 169
 170    NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);
 171
 172    if (stage == MESA_SHADER_TESS_CTRL) {
 173       /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
 174       static const gl_state_index16 tokens[STATE_LENGTH] =
 175          { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
 176       nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
 177    }
 178
 179    if (stage == MESA_SHADER_TESS_EVAL) {
 180       /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
 181        * a uniform if we don't.
 182        */
 183       struct gl_linked_shader *tcs =
 184          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
 185       uint32_t static_patch_vertices =
 186          tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
 187       static const gl_state_index16 tokens[STATE_LENGTH] =
 188          { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
 189       nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
 190    }
 191
 192    if (stage == MESA_SHADER_FRAGMENT) {
 193       static const struct nir_lower_wpos_ytransform_options wpos_options = {
 194          .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
 195          .fs_coord_pixel_center_integer = 1,
 196          .fs_coord_origin_upper_left = 1,
 197       };
 198
 199       bool progress = false;
 200       NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
 201       if (progress) {
 202          _mesa_add_state_reference(prog->Parameters,
 203                                    wpos_options.state_tokens);
 204       }
 205    }
 206
 207    NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);
 208
 209    return nir;
 210 }
 211
 212 void
 213 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
 214 {
 215    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 216
 217    /* Copy the info we just generated back into the gl_program */
 218    const char *prog_name = prog->info.name;
 219    const char *prog_label = prog->info.label;
 220    prog->info = nir->info;
 221    prog->info.name = prog_name;
 222    prog->info.label = prog_label;
 223 }
 224
 225 static unsigned
 226 get_new_program_id(struct intel_screen *screen)
 227 {
 228    return p_atomic_inc_return(&screen->program_id);
 229 }
 230
 231 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
 232                                         GLuint id, bool is_arb_asm)
 233 {
 234    struct brw_context *brw = brw_context(ctx);
 235    struct brw_program *prog = rzalloc(NULL, struct brw_program);
 236
 237    if (prog) {
 238       prog->id = get_new_program_id(brw->screen);
 239
 240       return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
 241    }
 242
 243    return NULL;
 244 }
 245
 246 static void brwDeleteProgram( struct gl_context *ctx,
 247                               struct gl_program *prog )
 248 {
 249    struct brw_context *brw = brw_context(ctx);
 250
 251    /* Beware!  prog's refcount has reached zero, and it's about to be freed.
 252     *
 253     * In brw_upload_pipeline_state(), we compare brw->programs[i] to
 254     * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
 255     * pointer has changed.
 256     *
 257     * We cannot leave brw->programs[i] as a dangling pointer to the dead
 258     * program.  malloc() may allocate the same memory for a new gl_program,
 259     * causing us to see matching pointers...but totally different programs.
 260     *
 261     * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
 262     * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
 263     * would cause us to see matching pointers (NULL == NULL), and fail to
 264     * detect that a program has changed since our last draw.
 265     *
 266     * So, set it to a bogus gl_program pointer that will never match,
 267     * causing us to properly reevaluate the state on our next draw.
 268     *
 269     * Getting this wrong causes heisenbugs which are very hard to catch,
 270     * as you need a very specific allocation pattern to hit the problem.
 271     */
 272    static const struct gl_program deleted_program;
 273
 274    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
 275       if (brw->programs[i] == prog)
 276          brw->programs[i] = (struct gl_program *) &deleted_program;
 277    }
 278
 279    _mesa_delete_program( ctx, prog );
 280 }
 281
 282
 283 static GLboolean
 284 brwProgramStringNotify(struct gl_context *ctx,
 285                        GLenum target,
 286                        struct gl_program *prog)
 287 {
 288    assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
 289
 290    struct brw_context *brw = brw_context(ctx);
 291    const struct brw_compiler *compiler = brw->screen->compiler;
 292
 293    switch (target) {
 294    case GL_FRAGMENT_PROGRAM_ARB: {
 295       struct brw_program *newFP = brw_program(prog);
 296       const struct brw_program *curFP =
 297          brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
 298
 299       if (newFP == curFP)
 300          brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
 301       newFP->id = get_new_program_id(brw->screen);
 302
 303       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 304
 305       brw_shader_gather_info(prog->nir, prog);
 306
 307       brw_fs_precompile(ctx, prog);
 308       break;
 309    }
 310    case GL_VERTEX_PROGRAM_ARB: {
 311       struct brw_program *newVP = brw_program(prog);
 312       const struct brw_program *curVP =
 313          brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
 314
 315       if (newVP == curVP)
 316          brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
 317       if (newVP->program.arb.IsPositionInvariant) {
 318          _mesa_insert_mvp_code(ctx, &newVP->program);
 319       }
 320       newVP->id = get_new_program_id(brw->screen);
 321
 322       /* Also tell tnl about it:
 323        */
 324       _tnl_program_string(ctx, target, prog);
 325
 326       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
 327                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 328
 329       brw_shader_gather_info(prog->nir, prog);
 330
 331       brw_vs_precompile(ctx, prog);
 332       break;
 333    }
 334    default:
 335       /*
 336        * driver->ProgramStringNotify is only called for ARB programs, fixed
 337        * function vertex programs, and ir_to_mesa (which isn't used by the
 338        * i965 back-end).  Therefore, even after geometry shaders are added,
 339        * this function should only ever be called with a target of
 340        * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
 341        */
 342       unreachable("Unexpected target in brwProgramStringNotify");
 343    }
 344
 345    return true;
 346 }
 347
 348 static void
 349 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 350 {
 351    struct brw_context *brw = brw_context(ctx);
 352    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 353    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
 354    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 355
 356    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
 357                    GL_ELEMENT_ARRAY_BARRIER_BIT |
 358                    GL_COMMAND_BARRIER_BIT))
 359       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 360
 361    if (barriers & GL_UNIFORM_BARRIER_BIT)
 362       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 363                PIPE_CONTROL_CONST_CACHE_INVALIDATE);
 364
 365    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
 366       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 367
 368    if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
 369                    GL_PIXEL_BUFFER_BARRIER_BIT))
 370       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 371                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 372
 373    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
 374       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 375                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 376
 377    /* Typed surface messages are handled by the render cache on IVB, so we
 378     * need to flush it too.
 379     */
 380    if (devinfo->gen == 7 && !devinfo->is_haswell)
 381       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
 382
 383    brw_emit_pipe_control_flush(brw, bits);
 384 }
 385
 386 static void
 387 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 388 {
 389    struct brw_context *brw = brw_context(ctx);
 390    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 391
 392    if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
 393       if (devinfo->gen >= 6) {
 394          brw_emit_pipe_control_flush(brw,
 395                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
 396                                      PIPE_CONTROL_CS_STALL);
 397          brw_emit_pipe_control_flush(brw,
 398                                      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 399       } else {
 400          brw_emit_pipe_control_flush(brw,
 401                                      PIPE_CONTROL_RENDER_TARGET_FLUSH);
 402       }
 403    }
 404 }
 405
 406 void
 407 brw_get_scratch_bo(struct brw_context *brw,
 408                    struct brw_bo **scratch_bo, int size)
 409 {
 410    struct brw_bo *old_bo = *scratch_bo;
 411
 412    if (old_bo && old_bo->size < size) {
 413       brw_bo_unreference(old_bo);
 414       old_bo = NULL;
 415    }
 416
 417    if (!old_bo) {
 418       *scratch_bo =
 419          brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
 420    }
 421 }
 422
 423 /**
 424  * Reserve enough scratch space for the given stage to hold \p per_thread_size
 425  * bytes times the given \p thread_count.
 426  */
 427 void
 428 brw_alloc_stage_scratch(struct brw_context *brw,
 429                         struct brw_stage_state *stage_state,
 430                         unsigned per_thread_size)
 431 {
 432    if (stage_state->per_thread_scratch >= per_thread_size)
 433       return;
 434
 435    stage_state->per_thread_scratch = per_thread_size;
 436
 437    if (stage_state->scratch_bo)
 438       brw_bo_unreference(stage_state->scratch_bo);
 439
 440    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 441    unsigned thread_count;
 442    switch(stage_state->stage) {
 443    case MESA_SHADER_VERTEX:
 444       thread_count = devinfo->max_vs_threads;
 445       break;
 446    case MESA_SHADER_TESS_CTRL:
 447       thread_count = devinfo->max_tcs_threads;
 448       break;
 449    case MESA_SHADER_TESS_EVAL:
 450       thread_count = devinfo->max_tes_threads;
 451       break;
 452    case MESA_SHADER_GEOMETRY:
 453       thread_count = devinfo->max_gs_threads;
 454       break;
 455    case MESA_SHADER_FRAGMENT:
 456       thread_count = devinfo->max_wm_threads;
 457       break;
 458    case MESA_SHADER_COMPUTE: {
 459       unsigned subslices = MAX2(brw->screen->subslice_total, 1);
 460
 461       /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
 462        *
 463        * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
 464        *  allocate scratch space enough so that each slice has 4 slices
 465        *  allowed."
 466        *
 467        * According to the other driver team, this applies to compute shaders
 468        * as well.  This is not currently documented at all.
 469        *
 470        * brw->screen->subslice_total is the TOTAL number of subslices
 471        * and we wish to view that there are 4 subslices per slice
 472        * instead of the actual number of subslices per slice.
 473        */
 474       if (devinfo->gen >= 9 && devinfo->gen < 11)
 475          subslices = 4 * brw->screen->devinfo.num_slices;
 476
 477       unsigned scratch_ids_per_subslice;
 478       if (devinfo->is_haswell) {
 479          /* WaCSScratchSize:hsw
 480           *
 481           * Haswell's scratch space address calculation appears to be sparse
 482           * rather than tightly packed. The Thread ID has bits indicating
 483           * which subslice, EU within a subslice, and thread within an EU it
 484           * is. There's a maximum of two slices and two subslices, so these
 485           * can be stored with a single bit. Even though there are only 10 EUs
 486           * per subslice, this is stored in 4 bits, so there's an effective
 487           * maximum value of 16 EUs. Similarly, although there are only 7
 488           * threads per EU, this is stored in a 3 bit number, giving an
 489           * effective maximum value of 8 threads per EU.
 490           *
 491           * This means that we need to use 16 * 8 instead of 10 * 7 for the
 492           * number of threads per subslice.
 493           */
 494          scratch_ids_per_subslice = 16 * 8;
 495       } else if (devinfo->is_cherryview) {
 496          /* Cherryview devices have either 6 or 8 EUs per subslice, and each
 497           * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
 498           * as if it had 8 EUs.
 499           */
 500          scratch_ids_per_subslice = 8 * 7;
 501       } else {
 502          scratch_ids_per_subslice = devinfo->max_cs_threads;
 503       }
 504
 505       thread_count = scratch_ids_per_subslice * subslices;
 506       break;
 507    }
 508    default:
 509       unreachable("Unsupported stage!");
 510    }
 511
 512    stage_state->scratch_bo =
 513       brw_bo_alloc(brw->bufmgr, "shader scratch space",
 514                    per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
 515 }
 516
 517 void brwInitFragProgFuncs( struct dd_function_table *functions )
 518 {
 519    assert(functions->ProgramStringNotify == _tnl_program_string);
 520
 521    functions->NewProgram = brwNewProgram;
 522    functions->DeleteProgram = brwDeleteProgram;
 523    functions->ProgramStringNotify = brwProgramStringNotify;
 524
 525    functions->LinkShader = brw_link_shader;
 526
 527    functions->MemoryBarrier = brw_memory_barrier;
 528    functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 529 }
 530
 531 struct shader_times {
 532    uint64_t time;
 533    uint64_t written;
 534    uint64_t reset;
 535 };
 536
 537 void
 538 brw_init_shader_time(struct brw_context *brw)
 539 {
 540    const int max_entries = 2048;
 541    brw->shader_time.bo =
 542       brw_bo_alloc(brw->bufmgr, "shader time",
 543                    max_entries * BRW_SHADER_TIME_STRIDE * 3,
 544                    BRW_MEMZONE_OTHER);
 545    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
 546    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
 547    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
 548                                           max_entries);
 549    brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
 550                                                max_entries);
 551    brw->shader_time.max_entries = max_entries;
 552 }
 553
 554 static int
 555 compare_time(const void *a, const void *b)
 556 {
 557    uint64_t * const *a_val = a;
 558    uint64_t * const *b_val = b;
 559
 560    /* We don't just subtract because we're turning the value to an int. */
 561    if (**a_val < **b_val)
 562       return -1;
 563    else if (**a_val == **b_val)
 564       return 0;
 565    else
 566       return 1;
 567 }
 568
 569 static void
 570 print_shader_time_line(const char *stage, const char *name,
 571                        int shader_num, uint64_t time, uint64_t total)
 572 {
 573    fprintf(stderr, "%-6s%-18s", stage, name);
 574
 575    if (shader_num != 0)
 576       fprintf(stderr, "%4d: ", shader_num);
 577    else
 578       fprintf(stderr, "    : ");
 579
 580    fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
 581            (long long)time,
 582            (double)time / 1000000000.0,
 583            (double)time / total * 100.0);
 584 }
 585
 586 static void
 587 brw_report_shader_time(struct brw_context *brw)
 588 {
 589    if (!brw->shader_time.bo || !brw->shader_time.num_entries)
 590       return;
 591
 592    uint64_t scaled[brw->shader_time.num_entries];
 593    uint64_t *sorted[brw->shader_time.num_entries];
 594    uint64_t total_by_type[ST_CS + 1];
 595    memset(total_by_type, 0, sizeof(total_by_type));
 596    double total = 0;
 597    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 598       uint64_t written = 0, reset = 0;
 599       enum shader_time_shader_type type = brw->shader_time.types[i];
 600
 601       sorted[i] = &scaled[i];
 602
 603       switch (type) {
 604       case ST_VS:
 605       case ST_TCS:
 606       case ST_TES:
 607       case ST_GS:
 608       case ST_FS8:
 609       case ST_FS16:
 610       case ST_FS32:
 611       case ST_CS:
 612          written = brw->shader_time.cumulative[i].written;
 613          reset = brw->shader_time.cumulative[i].reset;
 614          break;
 615
 616       default:
 617          /* I sometimes want to print things that aren't the 3 shader times.
 618           * Just print the sum in that case.
 619           */
 620          written = 1;
 621          reset = 0;
 622          break;
 623       }
 624
 625       uint64_t time = brw->shader_time.cumulative[i].time;
 626       if (written) {
 627          scaled[i] = time / written * (written + reset);
 628       } else {
 629          scaled[i] = time;
 630       }
 631
 632       switch (type) {
 633       case ST_VS:
 634       case ST_TCS:
 635       case ST_TES:
 636       case ST_GS:
 637       case ST_FS8:
 638       case ST_FS16:
 639       case ST_FS32:
 640       case ST_CS:
 641          total_by_type[type] += scaled[i];
 642          break;
 643       default:
 644          break;
 645       }
 646
 647       total += scaled[i];
 648    }
 649
 650    if (total == 0) {
 651       fprintf(stderr, "No shader time collected yet\n");
 652       return;
 653    }
 654
 655    qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
 656
 657    fprintf(stderr, "\n");
 658    fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
 659    for (int s = 0; s < brw->shader_time.num_entries; s++) {
 660       const char *stage;
 661       /* Work back from the sorted pointers times to a time to print. */
 662       int i = sorted[s] - scaled;
 663
 664       if (scaled[i] == 0)
 665          continue;
 666
 667       int shader_num = brw->shader_time.ids[i];
 668       const char *shader_name = brw->shader_time.names[i];
 669
 670       switch (brw->shader_time.types[i]) {
 671       case ST_VS:
 672          stage = "vs";
 673          break;
 674       case ST_TCS:
 675          stage = "tcs";
 676          break;
 677       case ST_TES:
 678          stage = "tes";
 679          break;
 680       case ST_GS:
 681          stage = "gs";
 682          break;
 683       case ST_FS8:
 684          stage = "fs8";
 685          break;
 686       case ST_FS16:
 687          stage = "fs16";
 688          break;
 689       case ST_FS32:
 690          stage = "fs32";
 691          break;
 692       case ST_CS:
 693          stage = "cs";
 694          break;
 695       default:
 696          stage = "other";
 697          break;
 698       }
 699
 700       print_shader_time_line(stage, shader_name, shader_num,
 701                              scaled[i], total);
 702    }
 703
 704    fprintf(stderr, "\n");
 705    print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
 706    print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
 707    print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
 708    print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
 709    print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
 710    print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
 711    print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
 712    print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
 713 }
 714
 715 static void
 716 brw_collect_shader_time(struct brw_context *brw)
 717 {
 718    if (!brw->shader_time.bo)
 719       return;
 720
 721    /* This probably stalls on the last rendering.  We could fix that by
 722     * delaying reading the reports, but it doesn't look like it's a big
 723     * overhead compared to the cost of tracking the time in the first place.
 724     */
 725    void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 726
 727    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 728       uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
 729
 730       brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
 731       brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
 732       brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
 733    }
 734
 735    /* Zero the BO out to clear it out for our next collection.
 736     */
 737    memset(bo_map, 0, brw->shader_time.bo->size);
 738    brw_bo_unmap(brw->shader_time.bo);
 739 }
 740
 741 void
 742 brw_collect_and_report_shader_time(struct brw_context *brw)
 743 {
 744    brw_collect_shader_time(brw);
 745
 746    if (brw->shader_time.report_time == 0 ||
 747        get_time() - brw->shader_time.report_time >= 1.0) {
 748       brw_report_shader_time(brw);
 749       brw->shader_time.report_time = get_time();
 750    }
 751 }
 752
 753 /**
 754  * Chooses an index in the shader_time buffer and sets up tracking information
 755  * for our printouts.
 756  *
 757  * Note that this holds on to references to the underlying programs, which may
 758  * change their lifetimes compared to normal operation.
 759  */
 760 int
 761 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
 762                           enum shader_time_shader_type type, bool is_glsl_sh)
 763 {
 764    int shader_time_index = brw->shader_time.num_entries++;
 765    assert(shader_time_index < brw->shader_time.max_entries);
 766    brw->shader_time.types[shader_time_index] = type;
 767
 768    const char *name;
 769    if (prog->Id == 0) {
 770       name = "ff";
 771    } else if (is_glsl_sh) {
 772       name = prog->info.label ?
 773          ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
 774    } else {
 775       name = "prog";
 776    }
 777
 778    brw->shader_time.names[shader_time_index] = name;
 779    brw->shader_time.ids[shader_time_index] = prog->Id;
 780
 781    return shader_time_index;
 782 }
 783
 784 void
 785 brw_destroy_shader_time(struct brw_context *brw)
 786 {
 787    brw_bo_unreference(brw->shader_time.bo);
 788    brw->shader_time.bo = NULL;
 789 }
 790
 791 void
 792 brw_stage_prog_data_free(const void *p)
 793 {
 794    struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
 795
 796    ralloc_free(prog_data->param);
 797    ralloc_free(prog_data->pull_param);
 798 }
 799
 800 void
 801 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 802 {
 803    fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
 804            stage, prog->Id, stage);
 805    _mesa_print_program(prog);
 806 }
 807
 808 void
 809 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
 810                              struct brw_sampler_prog_key_data *tex,
 811                              struct gl_program *prog)
 812 {
 813    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
 814    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
 815    for (unsigned i = 0; i < sampler_count; i++) {
 816       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
 817          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
 818          tex->swizzles[i] =
 819             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
 820       } else {
 821          /* Color sampler: assume no swizzling. */
 822          tex->swizzles[i] = SWIZZLE_XYZW;
 823       }
 824    }
 825 }
 826
 827 /**
 828  * Sets up the starting offsets for the groups of binding table entries
 829  * common to all pipeline stages.
 830  *
 831  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 832  * unused but also make sure that addition of small offsets to them will
 833  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 834  */
 835 uint32_t
 836 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
 837                                         const struct gl_program *prog,
 838                                         struct brw_stage_prog_data *stage_prog_data,
 839                                         uint32_t next_binding_table_offset)
 840 {
 841    int num_textures = util_last_bit(prog->SamplersUsed);
 842
 843    stage_prog_data->binding_table.texture_start = next_binding_table_offset;
 844    next_binding_table_offset += num_textures;
 845
 846    if (prog->info.num_ubos) {
 847       assert(prog->info.num_ubos <= BRW_MAX_UBO);
 848       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
 849       next_binding_table_offset += prog->info.num_ubos;
 850    } else {
 851       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
 852    }
 853
 854    if (prog->info.num_ssbos || prog->info.num_abos) {
 855       assert(prog->info.num_abos <= BRW_MAX_ABO);
 856       assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
 857       stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
 858       next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
 859    } else {
 860       stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
 861    }
 862
 863    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
 864       stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
 865       next_binding_table_offset++;
 866    } else {
 867       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
 868    }
 869
 870    if (prog->info.uses_texture_gather) {
 871       if (devinfo->gen >= 8) {
 872          stage_prog_data->binding_table.gather_texture_start =
 873             stage_prog_data->binding_table.texture_start;
 874       } else {
 875          stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
 876          next_binding_table_offset += num_textures;
 877       }
 878    } else {
 879       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
 880    }
 881
 882    if (prog->info.num_images) {
 883       stage_prog_data->binding_table.image_start = next_binding_table_offset;
 884       next_binding_table_offset += prog->info.num_images;
 885    } else {
 886       stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
 887    }
 888
 889    /* This may or may not be used depending on how the compile goes. */
 890    stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
 891    next_binding_table_offset++;
 892
 893    /* Plane 0 is just the regular texture section */
 894    stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
 895
 896    stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
 897    next_binding_table_offset += num_textures;
 898
 899    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
 900    next_binding_table_offset += num_textures;
 901
 902    /* Set the binding table size.  Some callers may append new entries
 903     * and increase this accordingly.
 904     */
 905    stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
 906
 907    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
 908    return next_binding_table_offset;
 909 }
 910
 911 void
 912 brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
 913                     unsigned id)
 914 {
 915    static const unsigned stage_offsets[] = {
 916       offsetof(struct brw_vs_prog_key, program_string_id),
 917       offsetof(struct brw_tcs_prog_key, program_string_id),
 918       offsetof(struct brw_tes_prog_key, program_string_id),
 919       offsetof(struct brw_gs_prog_key, program_string_id),
 920       offsetof(struct brw_wm_prog_key, program_string_id),
 921       offsetof(struct brw_cs_prog_key, program_string_id),
 922    };
 923    assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
 924    *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
 925 }
 926
 927 void
 928 brw_populate_default_key(const struct gen_device_info *devinfo,
 929                          union brw_any_prog_key *prog_key,
 930                          struct gl_shader_program *sh_prog,
 931                          struct gl_program *prog)
 932 {
 933    switch (prog->info.stage) {
 934    case MESA_SHADER_VERTEX:
 935       brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
 936       break;
 937    case MESA_SHADER_TESS_CTRL:
 938       brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
 939       break;
 940    case MESA_SHADER_TESS_EVAL:
 941       brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
 942       break;
 943    case MESA_SHADER_GEOMETRY:
 944       brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
 945       break;
 946    case MESA_SHADER_FRAGMENT:
 947       brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
 948       break;
 949    case MESA_SHADER_COMPUTE:
 950       brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
 951       break;
 952    default:
 953       unreachable("Unsupported stage!");
 954    }
 955 }