X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_vec4_generator.cpp;h=e9142c2c65c0270ae46dbbec9bd794fe8dbfd6bb;hp=d85e3c4324112c889250562f68259bffac143575;hb=90b6745bc80cf6dabb8f736dbf12d47c2a6602f5;hpb=b390ff35170fdc2b7f1fb1709a79d81edcd56981

diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index d85e3c43241..e9142c2c65c 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -24,6 +24,7 @@
 #include "brw_cfg.h"
 #include "brw_eu.h"
 #include "dev/gen_debug.h"
+#include "util/mesa-sha1.h"
 
 using namespace brw;
 
@@ -269,6 +270,17 @@ generate_tex(struct brw_codegen *p,
       break;
    }
 
+   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
+    * is set as part of the message descriptor.  On gen4, the PRM seems to
+    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
+    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
+    * gone from the message descriptor entirely and you just get UINT32 all
+    * the time regardless.  Since we can really only do non-UINT32 on gen4,
+    * just stomp it to UINT32 all the time.
+    */
+   if (inst->opcode == SHADER_OPCODE_TXS)
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+
    uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
          inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->base.binding_table.gather_texture_start
@@ -1207,7 +1219,7 @@ generate_scratch_write(struct brw_codegen *p,
    /* If the instruction is predicated, we'll predicate the send, not
     * the header setup.
     */
-   brw_set_default_predicate_control(p, false);
+   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
    gen6_resolve_implied_move(p, &header, inst->base_mrf);
 
@@ -1496,15 +1508,24 @@ generate_code(struct brw_codegen *p,
               void *log_data,
               const nir_shader *nir,
               struct brw_vue_prog_data *prog_data,
-              const struct cfg_t *cfg)
+              const struct cfg_t *cfg,
+              const performance &perf,
+              struct brw_compile_stats *stats)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
    bool debug_flag = INTEL_DEBUG &
                      intel_debug_flag_for_shader_stage(nir->info.stage);
    struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
+
+   /* `send_count` explicitly does not include spills or fills, as we'd
+    * like to use it as a metric for intentional memory access or other
+    * shared function use.  Otherwise, subtle changes to scheduling or
+    * register allocation could cause it to fluctuate wildly - and that
+    * effect is already counted in spill/fill counts.
+    */
    int spill_count = 0, fill_count = 0;
-   int loop_count = 0;
+   int loop_count = 0, send_count = 0;
 
    foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
       struct brw_reg src[3], dst;
@@ -1744,6 +1765,7 @@ generate_code(struct brw_codegen *p,
             generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
          } else {
             generate_math1_gen4(p, inst, dst, src[0]);
+            send_count++;
          }
          break;
 
@@ -1757,6 +1779,7 @@ generate_code(struct brw_codegen *p,
             generate_math_gen6(p, inst, dst, src[0], src[1]);
          } else {
             generate_math2_gen4(p, inst, dst, src[0], src[1]);
+            send_count++;
          }
          break;
 
@@ -1773,14 +1796,17 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_SAMPLEINFO:
          generate_tex(p, prog_data, nir->info.stage,
                       inst, dst, src[0], src[1], src[2]);
+         send_count++;
          break;
 
       case SHADER_OPCODE_GET_BUFFER_SIZE:
          generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+         send_count++;
          break;
 
       case VS_OPCODE_URB_WRITE:
          generate_vs_urb_write(p, inst);
+         send_count++;
          break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
@@ -1795,10 +1821,12 @@ generate_code(struct brw_codegen *p,
 
       case VS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+        send_count++;
         break;
 
      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+        send_count++;
         break;
 
      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -1807,14 +1835,17 @@ generate_code(struct brw_codegen *p,
 
      case GS_OPCODE_URB_WRITE:
         generate_gs_urb_write(p, inst);
+        send_count++;
         break;
 
      case GS_OPCODE_URB_WRITE_ALLOCATE:
         generate_gs_urb_write_allocate(p, inst);
+        send_count++;
         break;
 
      case GS_OPCODE_SVB_WRITE:
         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+        send_count++;
         break;
 
      case GS_OPCODE_SVB_SET_DST_INDEX:
@@ -1823,6 +1854,7 @@ generate_code(struct brw_codegen *p,
 
      case GS_OPCODE_THREAD_END:
         generate_gs_thread_end(p, inst);
+        send_count++;
         break;
 
      case GS_OPCODE_SET_WRITE_OFFSET:
@@ -1835,6 +1867,7 @@ generate_code(struct brw_codegen *p,
 
      case GS_OPCODE_FF_SYNC:
         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+        send_count++;
         break;
 
      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
@@ -1864,12 +1897,14 @@ generate_code(struct brw_codegen *p,
      case SHADER_OPCODE_SHADER_TIME_ADD:
         brw_shader_time_add(p, src[0],
                             prog_data->base.binding_table.shader_time_start);
+        send_count++;
        break;
 
      case VEC4_OPCODE_UNTYPED_ATOMIC:
        assert(src[2].file == BRW_IMMEDIATE_VALUE);
        brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                           !inst->dst.is_null(), inst->header_size);
+       send_count++;
        break;
 
      case VEC4_OPCODE_UNTYPED_SURFACE_READ:
@@ -1877,16 +1912,22 @@ generate_code(struct brw_codegen *p,
        assert(src[2].file == BRW_IMMEDIATE_VALUE);
        brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                 src[2].ud);
+       send_count++;
        break;
 
      case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
        assert(src[2].file == BRW_IMMEDIATE_VALUE);
        brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
                                  src[2].ud, inst->header_size);
+       send_count++;
        break;
 
      case SHADER_OPCODE_MEMORY_FENCE:
-       brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0);
+       brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+                        brw_message_target(inst->sfid),
+                        /* commit_enable */ false,
+                        /* bti */ 0);
+       send_count++;
        break;
 
      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
@@ -2035,8 +2076,7 @@ generate_code(struct brw_codegen *p,
           *
           * where they pack the four bytes from the low and high four DW.
           */
-         assert(_mesa_is_pow_two(dst.writemask) &&
-                dst.writemask != 0);
+         assert(util_is_power_of_two_nonzero(dst.writemask));
          unsigned offset = __builtin_ctz(dst.writemask);
 
          dst.type = BRW_REGISTER_TYPE_UB;
@@ -2066,10 +2106,12 @@ generate_code(struct brw_codegen *p,
 
       case TCS_OPCODE_URB_WRITE:
         generate_tcs_urb_write(p, inst, src[0]);
+        send_count++;
        break;
 
      case VEC4_OPCODE_URB_READ:
        generate_vec4_urb_read(p, inst, dst, src[0]);
+       send_count++;
        break;
 
      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
@@ -2111,15 +2153,18 @@ generate_code(struct brw_codegen *p,
 
      case TCS_OPCODE_RELEASE_INPUT:
        generate_tcs_release_input(p, dst, src[0], src[1]);
+       send_count++;
        break;
 
      case TCS_OPCODE_THREAD_END:
        generate_tcs_thread_end(p, inst);
+       send_count++;
        break;
 
      case SHADER_OPCODE_BARRIER:
        brw_barrier(p, src[0]);
        brw_WAIT(p);
+       send_count++;
        break;
 
      case SHADER_OPCODE_MOV_INDIRECT:
@@ -2174,28 +2219,50 @@ generate_code(struct brw_codegen *p,
    int after_size = p->next_insn_offset;
 
    if (unlikely(debug_flag)) {
-      fprintf(stderr, "Native code for %s %s shader %s:\n",
-              nir->info.label ? nir->info.label : "unnamed",
-              _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);
+      unsigned char sha1[21];
+      char sha1buf[41];
 
-      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
-              "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
-              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
-              spill_count, fill_count, before_size, after_size,
-              100.0f * (before_size - after_size) / before_size);
+      _mesa_sha1_compute(p->store, p->next_insn_offset, sha1);
+      _mesa_sha1_format(sha1buf, sha1);
+
+      fprintf(stderr, "Native code for %s %s shader %s (sha1 %s):\n",
+              nir->info.label ? nir->info.label : "unnamed",
+              _mesa_shader_stage_to_string(nir->info.stage), nir->info.name,
+              sha1buf);
 
-      dump_assembly(p->store, disasm_info);
+      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
+              "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
+              stage_abbrev, before_size / 16, loop_count, perf.latency,
+              spill_count, fill_count, send_count, before_size, after_size,
+              100.0f * (before_size - after_size) / before_size);
+
+      /* overriding the shader makes disasm_info invalid */
+      if (!brw_try_override_assembly(p, 0, sha1buf)) {
+         dump_assembly(p->store, 0, p->next_insn_offset,
+                       disasm_info, perf.block_latency);
+      } else {
+         fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
+      }
    }
 
    ralloc_free(disasm_info);
    assert(validated);
 
    compiler->shader_debug_log(log_data,
                               "%s vec4 shader: %d inst, %d loops, %u cycles, "
-                              "%d:%d spills:fills, compacted %d to %d bytes.",
+                              "%d:%d spills:fills, %u sends, "
+                              "compacted %d to %d bytes.",
                               stage_abbrev, before_size / 16,
-                              loop_count, cfg->cycle_count, spill_count,
-                              fill_count, before_size, after_size);
-
+                              loop_count, perf.latency, spill_count,
+                              fill_count, send_count, before_size, after_size);
+   if (stats) {
+      stats->dispatch_width = 0;
+      stats->instructions = before_size / 16;
+      stats->sends = send_count;
+      stats->loops = loop_count;
+      stats->cycles = perf.latency;
+      stats->spills = spill_count;
+      stats->fills = fill_count;
+   }
 }
 
 extern "C" const unsigned *
@@ -2204,13 +2271,22 @@ brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                            void *mem_ctx,
                            const nir_shader *nir,
                            struct brw_vue_prog_data *prog_data,
-                           const struct cfg_t *cfg)
+                           const struct cfg_t *cfg,
+                           const performance &perf,
+                           struct brw_compile_stats *stats)
 {
    struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
    brw_init_codegen(compiler->devinfo, p, mem_ctx);
    brw_set_default_access_mode(p, BRW_ALIGN_16);
 
-   generate_code(p, compiler, log_data, nir, prog_data, cfg);
+   generate_code(p, compiler, log_data, nir, prog_data, cfg, perf, stats);
+
+   assert(prog_data->base.const_data_size == 0);
+   if (nir->constant_data_size > 0) {
+      prog_data->base.const_data_size = nir->constant_data_size;
+      prog_data->base.const_data_offset =
+         brw_append_data(p, nir->constant_data, nir->constant_data_size, 32);
+   }
 
    return brw_get_program(p, &prog_data->base.program_size);
 }
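---

Two notes on the interface changes above.

First, brw_vec4_generate_assembly() and generate_code() now take a
`const performance &perf` plus an optional `struct brw_compile_stats *stats`
out-parameter, which generate_code() fills in when it is non-NULL. A minimal
caller-side sketch follows; the wrapper function itself is hypothetical (not
part of this patch) and assumes the usual mesa-internal declarations are in
scope, but it only touches the brw_compile_stats fields this diff actually
assigns:

   /* Hypothetical wrapper: compile a vec4 shader and report the
    * statistics that generate_code() fills in. */
   static const unsigned *
   compile_vec4_with_stats(const struct brw_compiler *compiler,
                           void *log_data, void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg,
                           const performance &perf)
   {
      struct brw_compile_stats stats = {};
      const unsigned *program =
         brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
                                    prog_data, cfg, perf, &stats);

      /* dispatch_width is reported as 0 for vec4 (SIMD4x2) shaders. */
      fprintf(stderr, "%u inst, %u loops, %u cycles, %u:%u spills:fills, "
              "%u sends\n", stats.instructions, stats.loops, stats.cycles,
              stats.spills, stats.fills, stats.sends);
      return program;
   }

Note also that dump_assembly() is now only reached when
brw_try_override_assembly() does not find a replacement shader; the sha1
printed in the debug output is the lookup key for that override mechanism
(in upstream mesa of this vintage, the search directory comes from the
INTEL_SHADER_ASM_READ_PATH environment variable).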
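Second, the @@ -2035 hunk swaps a hand-rolled power-of-two assert for
util_is_power_of_two_nonzero() from mesa's util headers, relying on the
destination writemask having exactly one channel enabled so that
__builtin_ctz() recovers the channel index. A standalone illustration
(plain C++; the helper restates the util function's check so the example
compiles on its own):

   #include <cassert>
   #include <cstdio>

   /* Same check util_is_power_of_two_nonzero() performs:
    * nonzero with exactly one bit set. */
   static bool is_power_of_two_nonzero(unsigned v)
   {
      return v != 0 && (v & (v - 1)) == 0;
   }

   int main()
   {
      unsigned writemask = 1u << 2;   /* WRITEMASK_Z: only channel 2 enabled */

      assert(is_power_of_two_nonzero(writemask));
      /* With a single-bit mask, counting trailing zeros yields the channel
       * index, which generate_code() then uses as a byte offset into the
       * UB-typed destination register. */
      unsigned offset = __builtin_ctz(writemask);
      printf("channel offset = %u\n", offset);   /* prints 2 */
      return 0;
   }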