X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_vec4.cpp;h=ee91be0a7ab6469ff4a12c8b83209d4cf70630af;hb=d4c6e3f196fb61939b0b6c9d1051ac1e35625d42;hp=e4838146ac15cb6638002a435e5a9b8135afd89a;hpb=70de61594dcf99f24eb31ebf98d62f13e1f44c2e;p=mesa.git diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index e4838146ac1..ee91be0a7ab 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -26,11 +26,11 @@ #include "brw_cfg.h" #include "brw_nir.h" #include "brw_vec4_builder.h" -#include "brw_vec4_live_variables.h" #include "brw_vec4_vs.h" #include "brw_dead_control_flow.h" -#include "common/gen_debug.h" +#include "dev/gen_debug.h" #include "program/prog_parameter.h" +#include "util/u_math.h" #define MAX_INSTRUCTION (1 << 30) @@ -41,7 +41,7 @@ namespace brw { void src_reg::init() { - memset(this, 0, sizeof(*this)); + memset((void*)this, 0, sizeof(*this)); this->file = BAD_FILE; this->type = BRW_REGISTER_TYPE_UD; } @@ -83,7 +83,7 @@ src_reg::src_reg(const dst_reg ®) : void dst_reg::init() { - memset(this, 0, sizeof(*this)); + memset((void*)this, 0, sizeof(*this)); this->file = BAD_FILE; this->type = BRW_REGISTER_TYPE_UD; this->writemask = WRITEMASK_XYZW; @@ -147,17 +147,14 @@ dst_reg::equals(const dst_reg &r) const } bool -vec4_instruction::is_send_from_grf() +vec4_instruction::is_send_from_grf() const { switch (opcode) { case SHADER_OPCODE_SHADER_TIME_ADD: case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: - case SHADER_OPCODE_UNTYPED_ATOMIC: - case SHADER_OPCODE_UNTYPED_SURFACE_READ: - case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: - case SHADER_OPCODE_TYPED_ATOMIC: - case SHADER_OPCODE_TYPED_SURFACE_READ: - case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: case VEC4_OPCODE_URB_READ: case TCS_OPCODE_URB_WRITE: case TCS_OPCODE_RELEASE_INPUT: @@ -211,12 +208,9 @@ vec4_instruction::size_read(unsigned arg) const { switch (opcode) { case SHADER_OPCODE_SHADER_TIME_ADD: - case SHADER_OPCODE_UNTYPED_ATOMIC: - case SHADER_OPCODE_UNTYPED_SURFACE_READ: - case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: - case SHADER_OPCODE_TYPED_ATOMIC: - case SHADER_OPCODE_TYPED_SURFACE_READ: - case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: case TCS_OPCODE_URB_WRITE: if (arg == 0) return mlen * REG_SIZE; @@ -256,6 +250,26 @@ vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo) return true; } +bool +vec4_instruction::can_do_cmod() +{ + if (!backend_instruction::can_do_cmod()) + return false; + + /* The accumulator result appears to get used for the conditional modifier + * generation. When negating a UD value, there is a 33rd bit generated for + * the sign in the accumulator value, so now you can't check, for example, + * equality with a 32-bit value. See piglit fs-op-neg-uvec4. + */ + for (unsigned i = 0; i < 3; i++) { + if (src[i].file != BAD_FILE && + type_is_unsigned_int(src[i].type) && src[i].negate) + return false; + } + + return true; +} + bool vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo) { @@ -312,13 +326,13 @@ vec4_instruction::can_change_types() const * instruction -- the generate_* functions generate additional MOVs * for setup. 
*/ -int -vec4_visitor::implied_mrf_writes(vec4_instruction *inst) +unsigned +vec4_instruction::implied_mrf_writes() const { - if (inst->mlen == 0 || inst->is_send_from_grf()) + if (mlen == 0 || is_send_from_grf()) return 0; - switch (inst->opcode) { + switch (opcode) { case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: case SHADER_OPCODE_SQRT: @@ -362,7 +376,7 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) case SHADER_OPCODE_TG4_OFFSET: case SHADER_OPCODE_SAMPLEINFO: case SHADER_OPCODE_GET_BUFFER_SIZE: - return inst->header_size; + return header_size; default: unreachable("not reached"); } @@ -375,13 +389,20 @@ src_reg::equals(const src_reg &r) const !reladdr && !r.reladdr); } +bool +src_reg::negative_equals(const src_reg &r) const +{ + return this->backend_reg::negative_equals(r) && + !reladdr && !r.reladdr; +} + bool vec4_visitor::opt_vector_float() { bool progress = false; foreach_block(block, cfg) { - int last_reg = -1, last_offset = -1; + unsigned last_reg = ~0u, last_offset = ~0u; enum brw_reg_file last_reg_file = BAD_FILE; uint8_t imm[4] = { 0 }; @@ -392,7 +413,7 @@ vec4_visitor::opt_vector_float() foreach_inst_in_block_safe(vec4_instruction, inst, block) { int vf = -1; - enum brw_reg_type need_type; + enum brw_reg_type need_type = BRW_REGISTER_TYPE_LAST; /* Look for unconditional MOVs from an immediate with a partial * writemask. Skip type-conversion MOVs other than integer 0, @@ -414,7 +435,7 @@ vec4_visitor::opt_vector_float() need_type = BRW_REGISTER_TYPE_F; } } else { - last_reg = -1; + last_reg = ~0u; } /* If this wasn't a MOV, or the destination register doesn't match, @@ -442,7 +463,7 @@ vec4_visitor::opt_vector_float() } inst_count = 0; - last_reg = -1; + last_reg = ~0u;; writemask = 0; dest_type = BRW_REGISTER_TYPE_F; @@ -475,7 +496,7 @@ vec4_visitor::opt_vector_float() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -556,7 +577,7 @@ vec4_visitor::opt_reduce_swizzle() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); return progress; } @@ -611,6 +632,9 @@ set_push_constant_loc(const int nr_uniforms, int *new_uniform_count, void vec4_visitor::pack_uniform_registers() { + if (!compiler->compact_params) + return; + uint8_t chans_used[this->uniforms]; int new_loc[this->uniforms]; int new_chan[this->uniforms]; @@ -688,8 +712,11 @@ vec4_visitor::pack_uniform_registers() * the next part of our packing algorithm. */ int reg = inst->src[0].nr; - for (unsigned i = 0; i < vec4s_read; i++) + int channel_size = type_sz(inst->src[0].type) / 4; + for (unsigned i = 0; i < vec4s_read; i++) { chans_used[reg + i] = 4; + channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size); + } } } @@ -789,10 +816,19 @@ vec4_visitor::opt_algebraic() break; if (inst->saturate) { - if (inst->dst.type != inst->src[0].type) + /* Full mixed-type saturates don't happen. However, we can end up + * with things like: + * + * mov.sat(8) g21<1>DF -1F + * + * Other mixed-size-but-same-base-type cases may also be possible. 
+ */ + if (inst->dst.type != inst->src[0].type && + inst->dst.type != BRW_REGISTER_TYPE_DF && + inst->src[0].type != BRW_REGISTER_TYPE_F) assert(!"unimplemented: saturate mixed types"); - if (brw_saturate_immediate(inst->dst.type, + if (brw_saturate_immediate(inst->src[0].type, &inst->src[0].as_brw_reg())) { inst->saturate = false; progress = true; @@ -800,6 +836,14 @@ vec4_visitor::opt_algebraic() } break; + case BRW_OPCODE_OR: + if (inst->src[1].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = src_reg(); + progress = true; + } + break; + case VEC4_OPCODE_UNPACK_UNIFORM: if (inst->src[0].file != UNIFORM) { inst->opcode = BRW_OPCODE_MOV; @@ -844,18 +888,6 @@ vec4_visitor::opt_algebraic() progress = true; } break; - case BRW_OPCODE_CMP: - if (inst->conditional_mod == BRW_CONDITIONAL_GE && - inst->src[0].abs && - inst->src[0].negate && - inst->src[1].is_zero()) { - inst->src[0].abs = false; - inst->src[0].negate = false; - inst->conditional_mod = BRW_CONDITIONAL_Z; - progress = true; - break; - } - break; case SHADER_OPCODE_BROADCAST: if (is_uniform(inst->src[0]) || inst->src[1].is_zero()) { @@ -872,7 +904,8 @@ vec4_visitor::opt_algebraic() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); return progress; } @@ -1124,6 +1157,12 @@ vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo, if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW) return false; + /* If we write to the flag register changing the swizzle would change + * what channels are written to the flag register. + */ + if (writes_flag()) + return false; + /* We can't swizzle implicit accumulator access. We'd have to * reswizzle the producer of the accumulator value in addition * to the consumer (i.e. both MUL and MACH). Just skip this. @@ -1168,9 +1207,31 @@ vec4_instruction::reswizzle(int dst_writemask, int swizzle) opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 && opcode != VEC4_OPCODE_PACK_BYTES) { for (int i = 0; i < 3; i++) { - if (src[i].file == BAD_FILE || src[i].file == IMM) + if (src[i].file == BAD_FILE) continue; + if (src[i].file == IMM) { + assert(src[i].type != BRW_REGISTER_TYPE_V && + src[i].type != BRW_REGISTER_TYPE_UV); + + /* Vector immediate types need to be reswizzled. */ + if (src[i].type == BRW_REGISTER_TYPE_VF) { + const unsigned imm[] = { + (src[i].ud >> 0) & 0x0ff, + (src[i].ud >> 8) & 0x0ff, + (src[i].ud >> 16) & 0x0ff, + (src[i].ud >> 24) & 0x0ff, + }; + + src[i] = brw_imm_vf4(imm[BRW_GET_SWZ(swizzle, 0)], + imm[BRW_GET_SWZ(swizzle, 1)], + imm[BRW_GET_SWZ(swizzle, 2)], + imm[BRW_GET_SWZ(swizzle, 3)]); + } + + continue; + } + src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle); } } @@ -1192,8 +1253,7 @@ vec4_visitor::opt_register_coalesce() { bool progress = false; int next_ip = 0; - - calculate_live_intervals(); + const vec4_live_variables &live = live_analysis.require(); foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) { int ip = next_ip; @@ -1235,7 +1295,7 @@ vec4_visitor::opt_register_coalesce() /* Can't coalesce this GRF if someone else was going to * read it later. 
*/ - if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip) + if (live.var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip) continue; /* We need to check interference with the final destination between this @@ -1275,6 +1335,15 @@ vec4_visitor::opt_register_coalesce() } } + /* VS_OPCODE_UNPACK_FLAGS_SIMD4X2 generates a bunch of mov(1) + * instructions, and this optimization pass is not capable of + * handling that. Bail on these instructions and hope that some + * later optimization pass can do the right thing after they are + * expanded. + */ + if (scan_inst->opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) + break; + /* This doesn't handle saturation on the instruction we * want to coalesce away if the register types do not match. * But if scan_inst is a non type-converting 'mov', we can fix @@ -1352,8 +1421,10 @@ vec4_visitor::opt_register_coalesce() * in the register instead. */ if (to_mrf && scan_inst->mlen > 0) { - if (inst->dst.nr >= scan_inst->base_mrf && - inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) { + unsigned start = scan_inst->base_mrf; + unsigned end = scan_inst->base_mrf + scan_inst->mlen; + + if (inst->dst.nr >= start && inst->dst.nr < end) { break; } } else { @@ -1403,7 +1474,7 @@ vec4_visitor::opt_register_coalesce() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -1453,6 +1524,9 @@ vec4_visitor::eliminate_find_live_channel() } } + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + return progress; } @@ -1527,19 +1601,19 @@ vec4_visitor::split_virtual_grfs() } } } - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); } void -vec4_visitor::dump_instruction(backend_instruction *be_inst) +vec4_visitor::dump_instruction(const backend_instruction *be_inst) const { dump_instruction(be_inst, stderr); } void -vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) +vec4_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const { - vec4_instruction *inst = (vec4_instruction *)be_inst; + const vec4_instruction *inst = (const vec4_instruction *)be_inst; if (inst->predicate) { fprintf(file, "(%cf%d.%d%s) ", @@ -1830,7 +1904,7 @@ vec4_visitor::lower_minmax() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -1945,6 +2019,31 @@ is_align1_df(vec4_instruction *inst) } } +/** + * Three source instruction must have a GRF/MRF destination register. + * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. 
+ */ +void +vec4_visitor::fixup_3src_null_dest() +{ + bool progress = false; + + foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) { + if (inst->is_3src(devinfo) && inst->dst.is_null()) { + const unsigned size_written = type_sz(inst->dst.type); + const unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE); + + inst->dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), + inst->dst.type); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_VARIABLES); +} + void vec4_visitor::convert_to_hw_regs() { @@ -2270,7 +2369,7 @@ vec4_visitor::lower_simd_width() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -2427,7 +2526,7 @@ vec4_visitor::scalarize_df() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -2470,7 +2569,7 @@ vec4_visitor::lower_64bit_mad_to_mul_add() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -2567,6 +2666,13 @@ vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg, } } +void +vec4_visitor::invalidate_analysis(brw::analysis_dependency_class c) +{ + backend_shader::invalidate_analysis(c); + live_analysis.invalidate(c); +} + bool vec4_visitor::run() { @@ -2696,6 +2802,8 @@ vec4_visitor::run() OPT(scalarize_df); } + fixup_3src_null_dest(); + bool allocated_without_spills = reg_allocate(); if (!allocated_without_spills) { @@ -2745,13 +2853,13 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *prog_data, - const nir_shader *src_shader, + nir_shader *nir, int shader_time_index, + struct brw_compile_stats *stats, char **error_str) { const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; - nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); - shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar); + brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar); const unsigned *assembly = NULL; @@ -2767,57 +2875,65 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, */ assert(!is_scalar); assert(key->copy_edgeflag); - shader->info.inputs_read |= VERT_BIT_EDGEFLAG; + nir->info.inputs_read |= VERT_BIT_EDGEFLAG; } - prog_data->inputs_read = shader->info.inputs_read; - prog_data->double_inputs_read = shader->info.vs.double_inputs; + prog_data->inputs_read = nir->info.inputs_read; + prog_data->double_inputs_read = nir->info.vs.double_inputs; - brw_nir_lower_vs_inputs(shader, key->gl_attrib_wa_flags); - brw_nir_lower_vue_outputs(shader, is_scalar); - shader = brw_postprocess_nir(shader, compiler, is_scalar); + brw_nir_lower_vs_inputs(nir, key->gl_attrib_wa_flags); + brw_nir_lower_vue_outputs(nir); + brw_postprocess_nir(nir, compiler, is_scalar); prog_data->base.clip_distance_mask = - ((1 << shader->info.clip_distance_array_size) - 1); + ((1 << nir->info.clip_distance_array_size) - 1); prog_data->base.cull_distance_mask = - ((1 << shader->info.cull_distance_array_size) - 1) << - shader->info.clip_distance_array_size; + ((1 << nir->info.cull_distance_array_size) - 1) << + nir->info.clip_distance_array_size; - unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read); + unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); /* gl_VertexID and gl_InstanceID are system values, but arrive via an 
* incoming vertex attribute. So, add an extra slot. */ - if (shader->info.system_values_read & - (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) | + if (nir->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) | BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { nr_attribute_slots++; } - if (shader->info.system_values_read & - BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX)) - prog_data->uses_basevertex = true; + /* gl_DrawID and IsIndexedDraw share its very own vec4 */ + if (nir->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID) | + BITFIELD64_BIT(SYSTEM_VALUE_IS_INDEXED_DRAW))) { + nr_attribute_slots++; + } - if (shader->info.system_values_read & + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_IS_INDEXED_DRAW)) + prog_data->uses_is_indexed_draw = true; + + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX)) + prog_data->uses_firstvertex = true; + + if (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE)) prog_data->uses_baseinstance = true; - if (shader->info.system_values_read & + if (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE)) prog_data->uses_vertexid = true; - if (shader->info.system_values_read & + if (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID)) prog_data->uses_instanceid = true; - /* gl_DrawID has its very own vec4 */ - if (shader->info.system_values_read & - BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) { - prog_data->uses_drawid = true; - nr_attribute_slots++; - } + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) + prog_data->uses_drawid = true; /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in @@ -2859,9 +2975,9 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, if (is_scalar) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; - fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, - NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */ - shader, 8, shader_time_index); + fs_visitor v(compiler, log_data, mem_ctx, &key->base, + &prog_data->base.base, + nir, 8, shader_time_index); if (!v.run_vs()) { if (error_str) *error_str = ralloc_strdup(mem_ctx, v.fail_msg); @@ -2871,19 +2987,21 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; - fs_generator g(compiler, log_data, mem_ctx, (void *) key, - &prog_data->base.base, v.promoted_constants, - v.runtime_check_aads_emit, MESA_SHADER_VERTEX); + fs_generator g(compiler, log_data, mem_ctx, + &prog_data->base.base, v.runtime_check_aads_emit, + MESA_SHADER_VERTEX); if (INTEL_DEBUG & DEBUG_VS) { const char *debug_name = ralloc_asprintf(mem_ctx, "%s vertex shader %s", - shader->info.label ? shader->info.label : + nir->info.label ? 
nir->info.label : "unnamed", - shader->info.name); + nir->info.name); g.enable_debug(debug_name); } - g.generate_code(v.cfg, 8); + g.generate_code(v.cfg, 8, v.shader_stats, + v.performance_analysis.require(), stats); + g.add_const_data(nir->constant_data, nir->constant_data_size); assembly = g.get_assembly(); } @@ -2891,7 +3009,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; vec4_vs_visitor v(compiler, log_data, key, prog_data, - shader, mem_ctx, shader_time_index); + nir, mem_ctx, shader_time_index); if (!v.run()) { if (error_str) *error_str = ralloc_strdup(mem_ctx, v.fail_msg); @@ -2900,7 +3018,10 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, } assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, - shader, &prog_data->base, v.cfg); + nir, &prog_data->base, + v.cfg, + v.performance_analysis.require(), + stats); } return assembly;
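
For reference, the reasoning behind the new vec4_instruction::can_do_cmod() guard can be seen in a standalone sketch. This is plain C++ with no Mesa dependencies, and acc33()/wrapped32() are invented helper names: it models how negating an unsigned (UD) source effectively produces a 33-bit accumulator value, so a conditional modifier evaluated on the accumulator no longer agrees with an equality check against the 32-bit result written to the destination (the piglit fs-op-neg-uvec4 case mentioned in the comment).

// Standalone illustration of the negated-UD hazard guarded against by the
// new vec4_instruction::can_do_cmod().  Plain C++, no Mesa dependencies;
// acc33() and wrapped32() are made-up helpers, not a hardware model API.
#include <cstdint>
#include <cstdio>

// What the accumulator effectively holds for "-src" on a UD source: the
// negation carries an extra sign bit (33 bits wide).
static int64_t acc33(uint32_t ud_src) {
   return -static_cast<int64_t>(ud_src);       // e.g. -(1u) -> -1
}

// What a 32-bit destination (and a naive 32-bit equality test) would see.
static uint32_t wrapped32(uint32_t ud_src) {
   return static_cast<uint32_t>(0u - ud_src);  // e.g. -(1u) -> 0xffffffff
}

int main() {
   const uint32_t src = 1u;
   const uint32_t cmp_against = 0xffffffffu;

   // The conditional modifier is generated from the (33-bit) accumulator
   // value, so ".z" against 0xffffffff fails even though the 32-bit result
   // written to the destination would match.
   printf("acc (33-bit): %lld\n", (long long) acc33(src));          // -1
   printf("dst (32-bit): 0x%08x\n", wrapped32(src));                // 0xffffffff
   printf("acc == cmp?  %s\n",
          acc33(src) == (int64_t) cmp_against ? "true" : "false");  // false
   printf("dst == cmp?  %s\n",
          wrapped32(src) == cmp_against ? "true" : "false");        // true
   return 0;
}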
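
The VF-immediate handling added to vec4_instruction::reswizzle() amounts to a byte shuffle of the packed immediate. A self-contained sketch of that shuffle follows; get_swz() and pack_vf4() are simplified stand-ins for BRW_GET_SWZ() and brw_imm_vf4(), used only to keep the example compilable on its own.

// Standalone sketch of the VF-immediate reswizzle: a VF immediate packs
// four 8-bit "restricted float" components into one 32-bit word, so
// applying a swizzle means shuffling bytes rather than touching
// src.swizzle.
#include <cstdint>
#include <cstdio>

// Component i of a swizzle encoded as four 2-bit fields (x=0 .. w=3).
static unsigned get_swz(unsigned swizzle, unsigned i) {
   return (swizzle >> (i * 2)) & 0x3;
}

// Repack four 8-bit VF components into a 32-bit immediate.
static uint32_t pack_vf4(unsigned x, unsigned y, unsigned z, unsigned w) {
   return (x << 0) | (y << 8) | (z << 16) | (w << 24);
}

static uint32_t reswizzle_vf(uint32_t vf_imm, unsigned swizzle) {
   const unsigned comp[] = {
      (vf_imm >>  0) & 0xff,
      (vf_imm >>  8) & 0xff,
      (vf_imm >> 16) & 0xff,
      (vf_imm >> 24) & 0xff,
   };
   return pack_vf4(comp[get_swz(swizzle, 0)],
                   comp[get_swz(swizzle, 1)],
                   comp[get_swz(swizzle, 2)],
                   comp[get_swz(swizzle, 3)]);
}

int main() {
   const uint32_t imm = pack_vf4(0x00, 0x10, 0x20, 0x30);  // components x,y,z,w
   const unsigned swizzle_yyyy = (1 << 0) | (1 << 2) | (1 << 4) | (1 << 6);

   printf("before: 0x%08x\n", imm);                              // 0x30201000
   printf("after : 0x%08x\n", reswizzle_vf(imm, swizzle_yyyy));  // 0x10101010
   return 0;
}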
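
Most of the invalidate_live_intervals() call sites above become invalidate_analysis(DEPENDENCY_*) calls, where each cached analysis declares which dependency classes force a recomputation. The sketch below shows the mechanism with an invented, simplified enum and struct (the real analysis_dependency_class set used by the patch is richer); it is only meant to illustrate why a pass such as opt_reduce_swizzle() can keep the cached liveness data while opt_register_coalesce() has to drop it.

// Illustrative dependency classes; simplified stand-ins for the real
// analysis_dependency_class values referenced in the patch.
#include <cstdio>

enum dependency_class : unsigned {
   DEP_INSTRUCTION_DETAIL    = 1u << 0,  // swizzles, types, modifiers tweaked
   DEP_INSTRUCTION_DATA_FLOW = 1u << 1,  // sources/destinations changed
   DEP_INSTRUCTIONS          = 1u << 2,  // instructions added or removed
   DEP_VARIABLES             = 1u << 3,  // virtual GRFs added or split
};

struct cached_analysis {
   const char *name;
   unsigned depends_on;   // bitmask of dependency_class
   bool valid;

   // Drop the cached result only if a dirty class intersects depends_on.
   void invalidate(unsigned dirty) {
      const bool hit = (dirty & depends_on) != 0;
      if (hit)
         valid = false;
      printf("%s: %s\n", name, hit ? "dropped" : "kept");
   }
};

int main() {
   cached_analysis live = {
      "live-variables",
      DEP_INSTRUCTIONS | DEP_INSTRUCTION_DATA_FLOW | DEP_VARIABLES,
      true
   };

   // A pass like opt_reduce_swizzle() only reports instruction details as
   // dirty, so the cached liveness survives it...
   live.invalidate(DEP_INSTRUCTION_DETAIL);
   // ...whereas opt_register_coalesce() removes instructions and reports
   // DEP_INSTRUCTIONS, which drops the cached liveness.
   live.invalidate(DEP_INSTRUCTIONS);
   return 0;
}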