src/mesa/drivers/dri/i965/brw_program.c

   1 /*
   2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
   3  Intel funded Tungsten Graphics to
   4  develop this 3D driver.
   5
   6  Permission is hereby granted, free of charge, to any person obtaining
   7  a copy of this software and associated documentation files (the
   8  "Software"), to deal in the Software without restriction, including
   9  without limitation the rights to use, copy, modify, merge, publish,
  10  distribute, sublicense, and/or sell copies of the Software, and to
  11  permit persons to whom the Software is furnished to do so, subject to
  12  the following conditions:
  13
  14  The above copyright notice and this permission notice (including the
  15  next paragraph) shall be included in all copies or substantial
  16  portions of the Software.
  17
  18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25
  26  **********************************************************************/
  27  /*
  28   * Authors:
  29   *   Keith Whitwell <keithw@vmware.com>
  30   */
  31
  32 #include <pthread.h>
  33 #include "main/imports.h"
  34 #include "main/glspirv.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "program/prog_to_nir.h"
  38 #include "program/program.h"
  39 #include "program/programopt.h"
  40 #include "tnl/tnl.h"
  41 #include "util/ralloc.h"
  42 #include "compiler/glsl/ir.h"
  43 #include "compiler/glsl/program.h"
  44 #include "compiler/glsl/gl_nir.h"
  45 #include "compiler/glsl/glsl_to_nir.h"
  46
  47 #include "brw_program.h"
  48 #include "brw_context.h"
  49 #include "compiler/brw_nir.h"
  50 #include "brw_defines.h"
  51 #include "intel_batchbuffer.h"
  52
  53 #include "brw_cs.h"
  54 #include "brw_gs.h"
  55 #include "brw_vs.h"
  56 #include "brw_wm.h"
  57
  58 #include "main/shaderapi.h"
  59 #include "main/shaderobj.h"
  60
  61 static bool
  62 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
  63 {
  64    if (is_scalar) {
  65       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  66                                type_size_scalar_bytes);
  67       return nir_lower_io(nir, nir_var_uniform, type_size_scalar_bytes, 0);
  68    } else {
  69       nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
  70                                type_size_vec4_bytes);
  71       return nir_lower_io(nir, nir_var_uniform, type_size_vec4_bytes, 0);
  72    }
  73 }
  74
  75 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
  76                                         GLuint id, bool is_arb_asm);
  77
/**
 * Build the NIR shader for \p stage and run the i965 front-end lowering on it.
 *
 * When \p shader_prog is non-NULL the shader is translated from SPIR-V or
 * GLSL IR; otherwise \p prog is translated with prog_to_nir().  The result
 * is then preprocessed, has image load/store and buffer access lowered, gets
 * stage-specific lowering (patch vertices for tessellation, window position
 * transform for fragment shaders) and finally has its uniforms lowered with
 * the scalar or vec4 size callbacks depending on \p is_scalar.
 *
 * NOTE(review): the MESA_SHADER_TESS_EVAL path dereferences \p shader_prog
 * unconditionally, so callers must pass a linked program for that stage.
 *
 * \return the lowered NIR shader.
 */
nir_shader *
brw_create_nir(struct brw_context *brw,
               const struct gl_shader_program *shader_prog,
               struct gl_program *prog,
               gl_shader_stage stage,
               bool is_scalar)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const nir_shader_compiler_options *options =
      ctx->Const.ShaderCompilerOptions[stage].NirOptions;
   nir_shader *nir;

   /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
   if (shader_prog) {
      if (shader_prog->data->spirv) {
         nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
      } else {
         nir = glsl_to_nir(ctx, shader_prog, stage, options);
      }
      assert (nir);

      nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
      nir_validate_shader(nir, "after glsl_to_nir or spirv_to_nir");
      NIR_PASS_V(nir, nir_lower_io_to_temporaries,
                 nir_shader_get_entrypoint(nir), true, false);
   } else {
      nir = prog_to_nir(prog, options);
      NIR_PASS_V(nir, nir_lower_regs_to_ssa); /* turn registers into SSA */
      NIR_PASS_V(nir, gl_nir_lower_samplers, NULL);
   }
   nir_validate_shader(nir, "before brw_preprocess_nir");

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Only build the software fp64 helper shader when full software lowering
    * is requested and this shader actually uses 64-bit values.  The helper
    * is re-parented to the same ralloc parent as nir.
    */
   nir_shader *softfp64 = NULL;
   if ((options->lower_doubles_options & nir_lower_fp64_full_software) &&
       nir->info.uses_64bit) {
      softfp64 = glsl_float64_funcs_to_nir(ctx, options);
      ralloc_steal(ralloc_parent(nir), softfp64);
   }

   /* brw_preprocess_nir() returns the shader to keep using from here on. */
   nir = brw_preprocess_nir(brw->screen->compiler, nir, softfp64);

   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo);

   NIR_PASS_V(nir, gl_nir_lower_buffers, shader_prog);
   /* Do a round of constant folding to clean up address calculations */
   NIR_PASS_V(nir, nir_opt_constant_folding);

   if (stage == MESA_SHADER_TESS_CTRL) {
      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
   }

   if (stage == MESA_SHADER_TESS_EVAL) {
      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
       * a uniform if we don't.
       */
      struct gl_linked_shader *tcs =
         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
      uint32_t static_patch_vertices =
         tcs ? tcs->Program->nir->info.tess.tcs_vertices_out : 0;
      static const gl_state_index16 tokens[STATE_LENGTH] =
         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      static const struct nir_lower_wpos_ytransform_options wpos_options = {
         .state_tokens = {STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0},
         .fs_coord_pixel_center_integer = 1,
         .fs_coord_origin_upper_left = 1,
      };

      /* Register the transform state with the program's parameter list only
       * if the pass actually changed the shader.
       */
      bool progress = false;
      NIR_PASS(progress, nir, nir_lower_wpos_ytransform, &wpos_options);
      if (progress) {
         _mesa_add_state_reference(prog->Parameters,
                                   wpos_options.state_tokens);
      }
   }

   NIR_PASS_V(nir, brw_nir_lower_uniforms, is_scalar);

   return nir;
}
 167
 168 void
 169 brw_shader_gather_info(nir_shader *nir, struct gl_program *prog)
 170 {
 171    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 172
 173    /* Copy the info we just generated back into the gl_program */
 174    const char *prog_name = prog->info.name;
 175    const char *prog_label = prog->info.label;
 176    prog->info = nir->info;
 177    prog->info.name = prog_name;
 178    prog->info.label = prog_label;
 179 }
 180
 181 static unsigned
 182 get_new_program_id(struct intel_screen *screen)
 183 {
 184    return p_atomic_inc_return(&screen->program_id);
 185 }
 186
 187 static struct gl_program *brwNewProgram(struct gl_context *ctx, GLenum target,
 188                                         GLuint id, bool is_arb_asm)
 189 {
 190    struct brw_context *brw = brw_context(ctx);
 191    struct brw_program *prog = rzalloc(NULL, struct brw_program);
 192
 193    if (prog) {
 194       prog->id = get_new_program_id(brw->screen);
 195
 196       return _mesa_init_gl_program(&prog->program, target, id, is_arb_asm);
 197    }
 198
 199    return NULL;
 200 }
 201
 202 static void brwDeleteProgram( struct gl_context *ctx,
 203                               struct gl_program *prog )
 204 {
 205    struct brw_context *brw = brw_context(ctx);
 206
 207    /* Beware!  prog's refcount has reached zero, and it's about to be freed.
 208     *
 209     * In brw_upload_pipeline_state(), we compare brw->programs[i] to
 210     * ctx->FooProgram._Current, and flag BRW_NEW_FOO_PROGRAM if the
 211     * pointer has changed.
 212     *
 213     * We cannot leave brw->programs[i] as a dangling pointer to the dead
 214     * program.  malloc() may allocate the same memory for a new gl_program,
 215     * causing us to see matching pointers...but totally different programs.
 216     *
 217     * We cannot set brw->programs[i] to NULL, either.  If we've deleted the
 218     * active program, Mesa may set ctx->FooProgram._Current to NULL.  That
 219     * would cause us to see matching pointers (NULL == NULL), and fail to
 220     * detect that a program has changed since our last draw.
 221     *
 222     * So, set it to a bogus gl_program pointer that will never match,
 223     * causing us to properly reevaluate the state on our next draw.
 224     *
 225     * Getting this wrong causes heisenbugs which are very hard to catch,
 226     * as you need a very specific allocation pattern to hit the problem.
 227     */
 228    static const struct gl_program deleted_program;
 229
 230    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
 231       if (brw->programs[i] == prog)
 232          brw->programs[i] = (struct gl_program *) &deleted_program;
 233    }
 234
 235    _mesa_delete_program( ctx, prog );
 236 }
 237
 238
 239 static GLboolean
 240 brwProgramStringNotify(struct gl_context *ctx,
 241                        GLenum target,
 242                        struct gl_program *prog)
 243 {
 244    assert(target == GL_VERTEX_PROGRAM_ARB || !prog->arb.IsPositionInvariant);
 245
 246    struct brw_context *brw = brw_context(ctx);
 247    const struct brw_compiler *compiler = brw->screen->compiler;
 248
 249    switch (target) {
 250    case GL_FRAGMENT_PROGRAM_ARB: {
 251       struct brw_program *newFP = brw_program(prog);
 252       const struct brw_program *curFP =
 253          brw_program_const(brw->programs[MESA_SHADER_FRAGMENT]);
 254
 255       if (newFP == curFP)
 256          brw->ctx.NewDriverState |= BRW_NEW_FRAGMENT_PROGRAM;
 257       newFP->id = get_new_program_id(brw->screen);
 258
 259       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 260
 261       brw_shader_gather_info(prog->nir, prog);
 262
 263       brw_fs_precompile(ctx, prog);
 264       break;
 265    }
 266    case GL_VERTEX_PROGRAM_ARB: {
 267       struct brw_program *newVP = brw_program(prog);
 268       const struct brw_program *curVP =
 269          brw_program_const(brw->programs[MESA_SHADER_VERTEX]);
 270
 271       if (newVP == curVP)
 272          brw->ctx.NewDriverState |= BRW_NEW_VERTEX_PROGRAM;
 273       if (newVP->program.arb.IsPositionInvariant) {
 274          _mesa_insert_mvp_code(ctx, &newVP->program);
 275       }
 276       newVP->id = get_new_program_id(brw->screen);
 277
 278       /* Also tell tnl about it:
 279        */
 280       _tnl_program_string(ctx, target, prog);
 281
 282       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
 283                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 284
 285       brw_shader_gather_info(prog->nir, prog);
 286
 287       brw_vs_precompile(ctx, prog);
 288       break;
 289    }
 290    default:
 291       /*
 292        * driver->ProgramStringNotify is only called for ARB programs, fixed
 293        * function vertex programs, and ir_to_mesa (which isn't used by the
 294        * i965 back-end).  Therefore, even after geometry shaders are added,
 295        * this function should only ever be called with a target of
 296        * GL_VERTEX_PROGRAM_ARB or GL_FRAGMENT_PROGRAM_ARB.
 297        */
 298       unreachable("Unexpected target in brwProgramStringNotify");
 299    }
 300
 301    return true;
 302 }
 303
 304 static void
 305 brw_memory_barrier(struct gl_context *ctx, GLbitfield barriers)
 306 {
 307    struct brw_context *brw = brw_context(ctx);
 308    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 309    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
 310    assert(devinfo->gen >= 7 && devinfo->gen <= 11);
 311
 312    if (barriers & (GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT |
 313                    GL_ELEMENT_ARRAY_BARRIER_BIT |
 314                    GL_COMMAND_BARRIER_BIT))
 315       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 316
 317    if (barriers & GL_UNIFORM_BARRIER_BIT)
 318       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 319                PIPE_CONTROL_CONST_CACHE_INVALIDATE);
 320
 321    if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
 322       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 323
 324    if (barriers & (GL_TEXTURE_UPDATE_BARRIER_BIT |
 325                    GL_PIXEL_BUFFER_BARRIER_BIT))
 326       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 327                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 328
 329    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
 330       bits |= (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 331                PIPE_CONTROL_RENDER_TARGET_FLUSH);
 332
 333    /* Typed surface messages are handled by the render cache on IVB, so we
 334     * need to flush it too.
 335     */
 336    if (devinfo->gen == 7 && !devinfo->is_haswell)
 337       bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
 338
 339    brw_emit_pipe_control_flush(brw, bits);
 340 }
 341
 342 static void
 343 brw_framebuffer_fetch_barrier(struct gl_context *ctx)
 344 {
 345    struct brw_context *brw = brw_context(ctx);
 346    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 347
 348    if (!ctx->Extensions.EXT_shader_framebuffer_fetch) {
 349       if (devinfo->gen >= 6) {
 350          brw_emit_pipe_control_flush(brw,
 351                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
 352                                      PIPE_CONTROL_CS_STALL);
 353          brw_emit_pipe_control_flush(brw,
 354                                      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 355       } else {
 356          brw_emit_pipe_control_flush(brw,
 357                                      PIPE_CONTROL_RENDER_TARGET_FLUSH);
 358       }
 359    }
 360 }
 361
 362 void
 363 brw_get_scratch_bo(struct brw_context *brw,
 364                    struct brw_bo **scratch_bo, int size)
 365 {
 366    struct brw_bo *old_bo = *scratch_bo;
 367
 368    if (old_bo && old_bo->size < size) {
 369       brw_bo_unreference(old_bo);
 370       old_bo = NULL;
 371    }
 372
 373    if (!old_bo) {
 374       *scratch_bo =
 375          brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
 376    }
 377 }
 378
 379 /**
 380  * Reserve enough scratch space for the given stage to hold \p per_thread_size
 381  * bytes times the given \p thread_count.
 382  */
 383 void
 384 brw_alloc_stage_scratch(struct brw_context *brw,
 385                         struct brw_stage_state *stage_state,
 386                         unsigned per_thread_size)
 387 {
 388    if (stage_state->per_thread_scratch >= per_thread_size)
 389       return;
 390
 391    stage_state->per_thread_scratch = per_thread_size;
 392
 393    if (stage_state->scratch_bo)
 394       brw_bo_unreference(stage_state->scratch_bo);
 395
 396    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 397    unsigned thread_count;
 398    switch(stage_state->stage) {
 399    case MESA_SHADER_VERTEX:
 400       thread_count = devinfo->max_vs_threads;
 401       break;
 402    case MESA_SHADER_TESS_CTRL:
 403       thread_count = devinfo->max_tcs_threads;
 404       break;
 405    case MESA_SHADER_TESS_EVAL:
 406       thread_count = devinfo->max_tes_threads;
 407       break;
 408    case MESA_SHADER_GEOMETRY:
 409       thread_count = devinfo->max_gs_threads;
 410       break;
 411    case MESA_SHADER_FRAGMENT:
 412       thread_count = devinfo->max_wm_threads;
 413       break;
 414    case MESA_SHADER_COMPUTE: {
 415       unsigned subslices = MAX2(brw->screen->subslice_total, 1);
 416
 417       /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
 418        *
 419        * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
 420        *  allocate scratch space enough so that each slice has 4 slices
 421        *  allowed."
 422        *
 423        * According to the other driver team, this applies to compute shaders
 424        * as well.  This is not currently documented at all.
 425        *
 426        * brw->screen->subslice_total is the TOTAL number of subslices
 427        * and we wish to view that there are 4 subslices per slice
 428        * instead of the actual number of subslices per slice.
 429        */
 430       if (devinfo->gen >= 9 && devinfo->gen < 11)
 431          subslices = 4 * brw->screen->devinfo.num_slices;
 432
 433       unsigned scratch_ids_per_subslice;
 434       if (devinfo->is_haswell) {
 435          /* WaCSScratchSize:hsw
 436           *
 437           * Haswell's scratch space address calculation appears to be sparse
 438           * rather than tightly packed. The Thread ID has bits indicating
 439           * which subslice, EU within a subslice, and thread within an EU it
 440           * is. There's a maximum of two slices and two subslices, so these
 441           * can be stored with a single bit. Even though there are only 10 EUs
 442           * per subslice, this is stored in 4 bits, so there's an effective
 443           * maximum value of 16 EUs. Similarly, although there are only 7
 444           * threads per EU, this is stored in a 3 bit number, giving an
 445           * effective maximum value of 8 threads per EU.
 446           *
 447           * This means that we need to use 16 * 8 instead of 10 * 7 for the
 448           * number of threads per subslice.
 449           */
 450          scratch_ids_per_subslice = 16 * 8;
 451       } else if (devinfo->is_cherryview) {
 452          /* Cherryview devices have either 6 or 8 EUs per subslice, and each
 453           * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
 454           * as if it had 8 EUs.
 455           */
 456          scratch_ids_per_subslice = 8 * 7;
 457       } else {
 458          scratch_ids_per_subslice = devinfo->max_cs_threads;
 459       }
 460
 461       thread_count = scratch_ids_per_subslice * subslices;
 462       break;
 463    }
 464    default:
 465       unreachable("Unsupported stage!");
 466    }
 467
 468    stage_state->scratch_bo =
 469       brw_bo_alloc(brw->bufmgr, "shader scratch space",
 470                    per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
 471 }
 472
 473 void brwInitFragProgFuncs( struct dd_function_table *functions )
 474 {
 475    assert(functions->ProgramStringNotify == _tnl_program_string);
 476
 477    functions->NewProgram = brwNewProgram;
 478    functions->DeleteProgram = brwDeleteProgram;
 479    functions->ProgramStringNotify = brwProgramStringNotify;
 480
 481    functions->LinkShader = brw_link_shader;
 482
 483    functions->MemoryBarrier = brw_memory_barrier;
 484    functions->FramebufferFetchBarrier = brw_framebuffer_fetch_barrier;
 485 }
 486
/**
 * Running totals for one shader_time entry, accumulated from the three
 * per-entry values read back from the shader time buffer.
 */
struct shader_times {
   /* Sum of the cycle counts read back for this entry. */
   uint64_t time;
   /* Sum of the "written" counter; used to scale time when reporting.
    * NOTE(review): presumably the number of timings that were recorded -
    * confirm against the shader-side code.
    */
   uint64_t written;
   /* Sum of the "reset" counter; added to written when scaling time.
    * NOTE(review): presumably the number of timings that were discarded -
    * confirm against the shader-side code.
    */
   uint64_t reset;
};
 492
 493 void
 494 brw_init_shader_time(struct brw_context *brw)
 495 {
 496    const int max_entries = 2048;
 497    brw->shader_time.bo =
 498       brw_bo_alloc(brw->bufmgr, "shader time",
 499                    max_entries * BRW_SHADER_TIME_STRIDE * 3,
 500                    BRW_MEMZONE_OTHER);
 501    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
 502    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
 503    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
 504                                           max_entries);
 505    brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
 506                                                max_entries);
 507    brw->shader_time.max_entries = max_entries;
 508 }
 509
 510 static int
 511 compare_time(const void *a, const void *b)
 512 {
 513    uint64_t * const *a_val = a;
 514    uint64_t * const *b_val = b;
 515
 516    /* We don't just subtract because we're turning the value to an int. */
 517    if (**a_val < **b_val)
 518       return -1;
 519    else if (**a_val == **b_val)
 520       return 0;
 521    else
 522       return 1;
 523 }
 524
 525 static void
 526 print_shader_time_line(const char *stage, const char *name,
 527                        int shader_num, uint64_t time, uint64_t total)
 528 {
 529    fprintf(stderr, "%-6s%-18s", stage, name);
 530
 531    if (shader_num != 0)
 532       fprintf(stderr, "%4d: ", shader_num);
 533    else
 534       fprintf(stderr, "    : ");
 535
 536    fprintf(stderr, "%16lld (%7.2f Gcycles)      %4.1f%%\n",
 537            (long long)time,
 538            (double)time / 1000000000.0,
 539            (double)time / total * 100.0);
 540 }
 541
 542 static void
 543 brw_report_shader_time(struct brw_context *brw)
 544 {
 545    if (!brw->shader_time.bo || !brw->shader_time.num_entries)
 546       return;
 547
 548    uint64_t scaled[brw->shader_time.num_entries];
 549    uint64_t *sorted[brw->shader_time.num_entries];
 550    uint64_t total_by_type[ST_CS + 1];
 551    memset(total_by_type, 0, sizeof(total_by_type));
 552    double total = 0;
 553    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 554       uint64_t written = 0, reset = 0;
 555       enum shader_time_shader_type type = brw->shader_time.types[i];
 556
 557       sorted[i] = &scaled[i];
 558
 559       switch (type) {
 560       case ST_VS:
 561       case ST_TCS:
 562       case ST_TES:
 563       case ST_GS:
 564       case ST_FS8:
 565       case ST_FS16:
 566       case ST_FS32:
 567       case ST_CS:
 568          written = brw->shader_time.cumulative[i].written;
 569          reset = brw->shader_time.cumulative[i].reset;
 570          break;
 571
 572       default:
 573          /* I sometimes want to print things that aren't the 3 shader times.
 574           * Just print the sum in that case.
 575           */
 576          written = 1;
 577          reset = 0;
 578          break;
 579       }
 580
 581       uint64_t time = brw->shader_time.cumulative[i].time;
 582       if (written) {
 583          scaled[i] = time / written * (written + reset);
 584       } else {
 585          scaled[i] = time;
 586       }
 587
 588       switch (type) {
 589       case ST_VS:
 590       case ST_TCS:
 591       case ST_TES:
 592       case ST_GS:
 593       case ST_FS8:
 594       case ST_FS16:
 595       case ST_FS32:
 596       case ST_CS:
 597          total_by_type[type] += scaled[i];
 598          break;
 599       default:
 600          break;
 601       }
 602
 603       total += scaled[i];
 604    }
 605
 606    if (total == 0) {
 607       fprintf(stderr, "No shader time collected yet\n");
 608       return;
 609    }
 610
 611    qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
 612
 613    fprintf(stderr, "\n");
 614    fprintf(stderr, "type          ID                  cycles spent                   %% of total\n");
 615    for (int s = 0; s < brw->shader_time.num_entries; s++) {
 616       const char *stage;
 617       /* Work back from the sorted pointers times to a time to print. */
 618       int i = sorted[s] - scaled;
 619
 620       if (scaled[i] == 0)
 621          continue;
 622
 623       int shader_num = brw->shader_time.ids[i];
 624       const char *shader_name = brw->shader_time.names[i];
 625
 626       switch (brw->shader_time.types[i]) {
 627       case ST_VS:
 628          stage = "vs";
 629          break;
 630       case ST_TCS:
 631          stage = "tcs";
 632          break;
 633       case ST_TES:
 634          stage = "tes";
 635          break;
 636       case ST_GS:
 637          stage = "gs";
 638          break;
 639       case ST_FS8:
 640          stage = "fs8";
 641          break;
 642       case ST_FS16:
 643          stage = "fs16";
 644          break;
 645       case ST_FS32:
 646          stage = "fs32";
 647          break;
 648       case ST_CS:
 649          stage = "cs";
 650          break;
 651       default:
 652          stage = "other";
 653          break;
 654       }
 655
 656       print_shader_time_line(stage, shader_name, shader_num,
 657                              scaled[i], total);
 658    }
 659
 660    fprintf(stderr, "\n");
 661    print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
 662    print_shader_time_line("total", "tcs", 0, total_by_type[ST_TCS], total);
 663    print_shader_time_line("total", "tes", 0, total_by_type[ST_TES], total);
 664    print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
 665    print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
 666    print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
 667    print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
 668    print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
 669 }
 670
 671 static void
 672 brw_collect_shader_time(struct brw_context *brw)
 673 {
 674    if (!brw->shader_time.bo)
 675       return;
 676
 677    /* This probably stalls on the last rendering.  We could fix that by
 678     * delaying reading the reports, but it doesn't look like it's a big
 679     * overhead compared to the cost of tracking the time in the first place.
 680     */
 681    void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
 682
 683    for (int i = 0; i < brw->shader_time.num_entries; i++) {
 684       uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
 685
 686       brw->shader_time.cumulative[i].time += times[BRW_SHADER_TIME_STRIDE * 0 / 4];
 687       brw->shader_time.cumulative[i].written += times[BRW_SHADER_TIME_STRIDE * 1 / 4];
 688       brw->shader_time.cumulative[i].reset += times[BRW_SHADER_TIME_STRIDE * 2 / 4];
 689    }
 690
 691    /* Zero the BO out to clear it out for our next collection.
 692     */
 693    memset(bo_map, 0, brw->shader_time.bo->size);
 694    brw_bo_unmap(brw->shader_time.bo);
 695 }
 696
 697 void
 698 brw_collect_and_report_shader_time(struct brw_context *brw)
 699 {
 700    brw_collect_shader_time(brw);
 701
 702    if (brw->shader_time.report_time == 0 ||
 703        get_time() - brw->shader_time.report_time >= 1.0) {
 704       brw_report_shader_time(brw);
 705       brw->shader_time.report_time = get_time();
 706    }
 707 }
 708
 709 /**
 710  * Chooses an index in the shader_time buffer and sets up tracking information
 711  * for our printouts.
 712  *
 713  * Note that this holds on to references to the underlying programs, which may
 714  * change their lifetimes compared to normal operation.
 715  */
 716 int
 717 brw_get_shader_time_index(struct brw_context *brw, struct gl_program *prog,
 718                           enum shader_time_shader_type type, bool is_glsl_sh)
 719 {
 720    int shader_time_index = brw->shader_time.num_entries++;
 721    assert(shader_time_index < brw->shader_time.max_entries);
 722    brw->shader_time.types[shader_time_index] = type;
 723
 724    const char *name;
 725    if (prog->Id == 0) {
 726       name = "ff";
 727    } else if (is_glsl_sh) {
 728       name = prog->info.label ?
 729          ralloc_strdup(brw->shader_time.names, prog->info.label) : "glsl";
 730    } else {
 731       name = "prog";
 732    }
 733
 734    brw->shader_time.names[shader_time_index] = name;
 735    brw->shader_time.ids[shader_time_index] = prog->Id;
 736
 737    return shader_time_index;
 738 }
 739
 740 void
 741 brw_destroy_shader_time(struct brw_context *brw)
 742 {
 743    brw_bo_unreference(brw->shader_time.bo);
 744    brw->shader_time.bo = NULL;
 745 }
 746
 747 void
 748 brw_stage_prog_data_free(const void *p)
 749 {
 750    struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
 751
 752    ralloc_free(prog_data->param);
 753    ralloc_free(prog_data->pull_param);
 754 }
 755
 756 void
 757 brw_dump_arb_asm(const char *stage, struct gl_program *prog)
 758 {
 759    fprintf(stderr, "ARB_%s_program %d ir for native %s shader\n",
 760            stage, prog->Id, stage);
 761    _mesa_print_program(prog);
 762 }
 763
 764 void
 765 brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
 766                              struct brw_sampler_prog_key_data *tex,
 767                              struct gl_program *prog)
 768 {
 769    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
 770    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
 771    for (unsigned i = 0; i < sampler_count; i++) {
 772       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
 773          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
 774          tex->swizzles[i] =
 775             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
 776       } else {
 777          /* Color sampler: assume no swizzling. */
 778          tex->swizzles[i] = SWIZZLE_XYZW;
 779       }
 780    }
 781 }
 782
 783 /**
 784  * Sets up the starting offsets for the groups of binding table entries
 785  * common to all pipeline stages.
 786  *
 787  * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're
 788  * unused but also make sure that addition of small offsets to them will
 789  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
 790  */
 791 uint32_t
 792 brw_assign_common_binding_table_offsets(const struct gen_device_info *devinfo,
 793                                         const struct gl_program *prog,
 794                                         struct brw_stage_prog_data *stage_prog_data,
 795                                         uint32_t next_binding_table_offset)
 796 {
 797    int num_textures = util_last_bit(prog->SamplersUsed);
 798
 799    stage_prog_data->binding_table.texture_start = next_binding_table_offset;
 800    next_binding_table_offset += num_textures;
 801
 802    if (prog->info.num_ubos) {
 803       assert(prog->info.num_ubos <= BRW_MAX_UBO);
 804       stage_prog_data->binding_table.ubo_start = next_binding_table_offset;
 805       next_binding_table_offset += prog->info.num_ubos;
 806    } else {
 807       stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0;
 808    }
 809
 810    if (prog->info.num_ssbos || prog->info.num_abos) {
 811       assert(prog->info.num_abos <= BRW_MAX_ABO);
 812       assert(prog->info.num_ssbos <= BRW_MAX_SSBO);
 813       stage_prog_data->binding_table.ssbo_start = next_binding_table_offset;
 814       next_binding_table_offset += prog->info.num_abos + prog->info.num_ssbos;
 815    } else {
 816       stage_prog_data->binding_table.ssbo_start = 0xd0d0d0d0;
 817    }
 818
 819    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
 820       stage_prog_data->binding_table.shader_time_start = next_binding_table_offset;
 821       next_binding_table_offset++;
 822    } else {
 823       stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0;
 824    }
 825
 826    if (prog->info.uses_texture_gather) {
 827       if (devinfo->gen >= 8) {
 828          stage_prog_data->binding_table.gather_texture_start =
 829             stage_prog_data->binding_table.texture_start;
 830       } else {
 831          stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset;
 832          next_binding_table_offset += num_textures;
 833       }
 834    } else {
 835       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
 836    }
 837
 838    if (prog->info.num_images) {
 839       stage_prog_data->binding_table.image_start = next_binding_table_offset;
 840       next_binding_table_offset += prog->info.num_images;
 841    } else {
 842       stage_prog_data->binding_table.image_start = 0xd0d0d0d0;
 843    }
 844
 845    /* This may or may not be used depending on how the compile goes. */
 846    stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset;
 847    next_binding_table_offset++;
 848
 849    /* Plane 0 is just the regular texture section */
 850    stage_prog_data->binding_table.plane_start[0] = stage_prog_data->binding_table.texture_start;
 851
 852    stage_prog_data->binding_table.plane_start[1] = next_binding_table_offset;
 853    next_binding_table_offset += num_textures;
 854
 855    stage_prog_data->binding_table.plane_start[2] = next_binding_table_offset;
 856    next_binding_table_offset += num_textures;
 857
 858    /* Set the binding table size.  Some callers may append new entries
 859     * and increase this accordingly.
 860     */
 861    stage_prog_data->binding_table.size_bytes = next_binding_table_offset * 4;
 862
 863    assert(next_binding_table_offset <= BRW_MAX_SURFACES);
 864    return next_binding_table_offset;
 865 }
 866
 867 void
 868 brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
 869                     unsigned id)
 870 {
 871    static const unsigned stage_offsets[] = {
 872       offsetof(struct brw_vs_prog_key, program_string_id),
 873       offsetof(struct brw_tcs_prog_key, program_string_id),
 874       offsetof(struct brw_tes_prog_key, program_string_id),
 875       offsetof(struct brw_gs_prog_key, program_string_id),
 876       offsetof(struct brw_wm_prog_key, program_string_id),
 877       offsetof(struct brw_cs_prog_key, program_string_id),
 878    };
 879    assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
 880    *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
 881 }
 882
 883 void
 884 brw_populate_default_key(const struct gen_device_info *devinfo,
 885                          union brw_any_prog_key *prog_key,
 886                          struct gl_shader_program *sh_prog,
 887                          struct gl_program *prog)
 888 {
 889    switch (prog->info.stage) {
 890    case MESA_SHADER_VERTEX:
 891       brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
 892       break;
 893    case MESA_SHADER_TESS_CTRL:
 894       brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
 895       break;
 896    case MESA_SHADER_TESS_EVAL:
 897       brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
 898       break;
 899    case MESA_SHADER_GEOMETRY:
 900       brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
 901       break;
 902    case MESA_SHADER_FRAGMENT:
 903       brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
 904       break;
 905    case MESA_SHADER_COMPUTE:
 906       brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
 907       break;
 908    default:
 909       unreachable("Unsupported stage!");
 910    }
 911 }