#include "brw_cfg.h"
#include "brw_eu.h"
#include "dev/gen_debug.h"
+#include "util/mesa-sha1.h"
using namespace brw;
/* If the instruction is predicated, we'll predicate the send, not
* the header setup.
*/
- brw_set_default_predicate_control(p, false);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
gen6_resolve_implied_move(p, &header, inst->base_mrf);
void *log_data,
const nir_shader *nir,
struct brw_vue_prog_data *prog_data,
- const struct cfg_t *cfg)
+ const struct cfg_t *cfg,
+ const performance &perf,
+ struct brw_compile_stats *stats)
{
const struct gen_device_info *devinfo = p->devinfo;
const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
bool debug_flag = INTEL_DEBUG &
intel_debug_flag_for_shader_stage(nir->info.stage);
struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
+
+ /* `send_count` explicitly does not include spills or fills, as we'd
+ * like to use it as a metric for intentional memory access or other
+ * shared function use. Otherwise, subtle changes to scheduling or
+ * register allocation could cause it to fluctuate wildly - and that
+ * effect is already counted in spill/fill counts.
+ */
int spill_count = 0, fill_count = 0;
- int loop_count = 0;
+ int loop_count = 0, send_count = 0;
foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
struct brw_reg src[3], dst;
generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
} else {
generate_math1_gen4(p, inst, dst, src[0]);
+ send_count++;
}
break;
generate_math_gen6(p, inst, dst, src[0], src[1]);
} else {
generate_math2_gen4(p, inst, dst, src[0], src[1]);
+ send_count++;
}
break;
case SHADER_OPCODE_SAMPLEINFO:
generate_tex(p, prog_data, nir->info.stage,
inst, dst, src[0], src[1], src[2]);
+ send_count++;
break;
case SHADER_OPCODE_GET_BUFFER_SIZE:
generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+ send_count++;
break;
case VS_OPCODE_URB_WRITE:
generate_vs_urb_write(p, inst);
+ send_count++;
break;
case SHADER_OPCODE_GEN4_SCRATCH_READ:
case VS_OPCODE_PULL_CONSTANT_LOAD:
generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+ send_count++;
break;
case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+ send_count++;
break;
case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
case GS_OPCODE_URB_WRITE:
generate_gs_urb_write(p, inst);
+ send_count++;
break;
case GS_OPCODE_URB_WRITE_ALLOCATE:
generate_gs_urb_write_allocate(p, inst);
+ send_count++;
break;
case GS_OPCODE_SVB_WRITE:
generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+ send_count++;
break;
case GS_OPCODE_SVB_SET_DST_INDEX:
case GS_OPCODE_THREAD_END:
generate_gs_thread_end(p, inst);
+ send_count++;
break;
case GS_OPCODE_SET_WRITE_OFFSET:
case GS_OPCODE_FF_SYNC:
generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+ send_count++;
break;
case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
case SHADER_OPCODE_SHADER_TIME_ADD:
brw_shader_time_add(p, src[0],
prog_data->base.binding_table.shader_time_start);
+ send_count++;
break;
case VEC4_OPCODE_UNTYPED_ATOMIC:
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
!inst->dst.is_null(), inst->header_size);
+ send_count++;
break;
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
src[2].ud);
+ send_count++;
break;
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
assert(src[2].file == BRW_IMMEDIATE_VALUE);
brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
src[2].ud, inst->header_size);
+ send_count++;
break;
- case SHADER_OPCODE_MEMORY_FENCE:
- brw_memory_fence(p, dst, BRW_OPCODE_SEND);
+ case SHADER_OPCODE_MEMORY_FENCE: {
+ const unsigned sends =
+ brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false,
+ /* bti */ 0);
+ send_count += sends;
break;
+ }
case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
const struct brw_reg mask =
*
* where they pack the four bytes from the low and high four DW.
*/
- assert(_mesa_is_pow_two(dst.writemask) &&
- dst.writemask != 0);
+ assert(util_is_power_of_two_nonzero(dst.writemask));
unsigned offset = __builtin_ctz(dst.writemask);
dst.type = BRW_REGISTER_TYPE_UB;
case TCS_OPCODE_URB_WRITE:
generate_tcs_urb_write(p, inst, src[0]);
+ send_count++;
break;
case VEC4_OPCODE_URB_READ:
generate_vec4_urb_read(p, inst, dst, src[0]);
+ send_count++;
break;
case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
case TCS_OPCODE_RELEASE_INPUT:
generate_tcs_release_input(p, dst, src[0], src[1]);
+ send_count++;
break;
case TCS_OPCODE_THREAD_END:
generate_tcs_thread_end(p, inst);
+ send_count++;
break;
case SHADER_OPCODE_BARRIER:
brw_barrier(p, src[0]);
brw_WAIT(p);
+ send_count++;
break;
case SHADER_OPCODE_MOV_INDIRECT:
int after_size = p->next_insn_offset;
if (unlikely(debug_flag)) {
- fprintf(stderr, "Native code for %s %s shader %s:\n",
- nir->info.label ? nir->info.label : "unnamed",
- _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);
+ unsigned char sha1[21];
+ char sha1buf[41];
- fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
- "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
- stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
- spill_count, fill_count, before_size, after_size,
- 100.0f * (before_size - after_size) / before_size);
+ _mesa_sha1_compute(p->store, p->next_insn_offset, sha1);
+ _mesa_sha1_format(sha1buf, sha1);
+
+ fprintf(stderr, "Native code for %s %s shader %s (sha1 %s):\n",
+ nir->info.label ? nir->info.label : "unnamed",
+ _mesa_shader_stage_to_string(nir->info.stage), nir->info.name,
+ sha1buf);
- dump_assembly(p->store, disasm_info);
+ fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
+ "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
+ stage_abbrev, before_size / 16, loop_count, perf.latency,
+ spill_count, fill_count, send_count, before_size, after_size,
+ 100.0f * (before_size - after_size) / before_size);
+
+ /* overriding the shader makes disasm_info invalid */
+ if (!brw_try_override_assembly(p, 0, sha1buf)) {
+ dump_assembly(p->store, disasm_info);
+ } else {
+ fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
+ }
}
ralloc_free(disasm_info);
assert(validated);
compiler->shader_debug_log(log_data,
"%s vec4 shader: %d inst, %d loops, %u cycles, "
- "%d:%d spills:fills, compacted %d to %d bytes.",
+ "%d:%d spills:fills, %u sends, "
+ "compacted %d to %d bytes.",
stage_abbrev, before_size / 16,
- loop_count, cfg->cycle_count, spill_count,
- fill_count, before_size, after_size);
-
+ loop_count, perf.latency, spill_count,
+ fill_count, send_count, before_size, after_size);
+ if (stats) {
+ stats->dispatch_width = 0;
+ stats->instructions = before_size / 16;
+ stats->sends = send_count;
+ stats->loops = loop_count;
+ stats->cycles = perf.latency;
+ stats->spills = spill_count;
+ stats->fills = fill_count;
+ }
}
extern "C" const unsigned *
void *mem_ctx,
const nir_shader *nir,
struct brw_vue_prog_data *prog_data,
- const struct cfg_t *cfg)
+ const struct cfg_t *cfg,
+ const performance &perf,
+ struct brw_compile_stats *stats)
{
struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
brw_init_codegen(compiler->devinfo, p, mem_ctx);
brw_set_default_access_mode(p, BRW_ALIGN_16);
- generate_code(p, compiler, log_data, nir, prog_data, cfg);
+ generate_code(p, compiler, log_data, nir, prog_data, cfg, perf, stats);
return brw_get_program(p, &prog_data->base.program_size);
}