X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_vec4.cpp;h=ee91be0a7ab6469ff4a12c8b83209d4cf70630af;hb=d4c6e3f196fb61939b0b6c9d1051ac1e35625d42;hp=e27be8fc2547771a5fefb06a3be55c158f38efe9;hpb=7f728bce811fc283e672e3a07b008bb7b52de35e;p=mesa.git diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index e27be8fc254..ee91be0a7ab 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -26,11 +26,11 @@ #include "brw_cfg.h" #include "brw_nir.h" #include "brw_vec4_builder.h" -#include "brw_vec4_live_variables.h" #include "brw_vec4_vs.h" #include "brw_dead_control_flow.h" -#include "common/gen_debug.h" +#include "dev/gen_debug.h" #include "program/prog_parameter.h" +#include "util/u_math.h" #define MAX_INSTRUCTION (1 << 30) @@ -41,9 +41,9 @@ namespace brw { void src_reg::init() { - memset(this, 0, sizeof(*this)); - + memset((void*)this, 0, sizeof(*this)); this->file = BAD_FILE; + this->type = BRW_REGISTER_TYPE_UD; } src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type) @@ -83,8 +83,9 @@ src_reg::src_reg(const dst_reg ®) : void dst_reg::init() { - memset(this, 0, sizeof(*this)); + memset((void*)this, 0, sizeof(*this)); this->file = BAD_FILE; + this->type = BRW_REGISTER_TYPE_UD; this->writemask = WRITEMASK_XYZW; } @@ -146,17 +147,14 @@ dst_reg::equals(const dst_reg &r) const } bool -vec4_instruction::is_send_from_grf() +vec4_instruction::is_send_from_grf() const { switch (opcode) { case SHADER_OPCODE_SHADER_TIME_ADD: case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: - case SHADER_OPCODE_UNTYPED_ATOMIC: - case SHADER_OPCODE_UNTYPED_SURFACE_READ: - case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: - case SHADER_OPCODE_TYPED_ATOMIC: - case SHADER_OPCODE_TYPED_SURFACE_READ: - case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: case VEC4_OPCODE_URB_READ: case TCS_OPCODE_URB_WRITE: case TCS_OPCODE_RELEASE_INPUT: @@ -210,12 +208,9 @@ vec4_instruction::size_read(unsigned arg) const { switch (opcode) { case SHADER_OPCODE_SHADER_TIME_ADD: - case SHADER_OPCODE_UNTYPED_ATOMIC: - case SHADER_OPCODE_UNTYPED_SURFACE_READ: - case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: - case SHADER_OPCODE_TYPED_ATOMIC: - case SHADER_OPCODE_TYPED_SURFACE_READ: - case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_UNTYPED_ATOMIC: + case VEC4_OPCODE_UNTYPED_SURFACE_READ: + case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: case TCS_OPCODE_URB_WRITE: if (arg == 0) return mlen * REG_SIZE; @@ -255,6 +250,26 @@ vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo) return true; } +bool +vec4_instruction::can_do_cmod() +{ + if (!backend_instruction::can_do_cmod()) + return false; + + /* The accumulator result appears to get used for the conditional modifier + * generation. When negating a UD value, there is a 33rd bit generated for + * the sign in the accumulator value, so now you can't check, for example, + * equality with a 32-bit value. See piglit fs-op-neg-uvec4. + */ + for (unsigned i = 0; i < 3; i++) { + if (src[i].file != BAD_FILE && + type_is_unsigned_int(src[i].type) && src[i].negate) + return false; + } + + return true; +} + bool vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo) { @@ -311,13 +326,13 @@ vec4_instruction::can_change_types() const * instruction -- the generate_* functions generate additional MOVs * for setup. 
*/ -int -vec4_visitor::implied_mrf_writes(vec4_instruction *inst) +unsigned +vec4_instruction::implied_mrf_writes() const { - if (inst->mlen == 0 || inst->is_send_from_grf()) + if (mlen == 0 || is_send_from_grf()) return 0; - switch (inst->opcode) { + switch (opcode) { case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: case SHADER_OPCODE_SQRT: @@ -360,8 +375,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) case SHADER_OPCODE_TG4: case SHADER_OPCODE_TG4_OFFSET: case SHADER_OPCODE_SAMPLEINFO: - case VS_OPCODE_GET_BUFFER_SIZE: - return inst->header_size; + case SHADER_OPCODE_GET_BUFFER_SIZE: + return header_size; default: unreachable("not reached"); } @@ -374,13 +389,20 @@ src_reg::equals(const src_reg &r) const !reladdr && !r.reladdr); } +bool +src_reg::negative_equals(const src_reg &r) const +{ + return this->backend_reg::negative_equals(r) && + !reladdr && !r.reladdr; +} + bool vec4_visitor::opt_vector_float() { bool progress = false; foreach_block(block, cfg) { - int last_reg = -1, last_offset = -1; + unsigned last_reg = ~0u, last_offset = ~0u; enum brw_reg_file last_reg_file = BAD_FILE; uint8_t imm[4] = { 0 }; @@ -391,7 +413,7 @@ vec4_visitor::opt_vector_float() foreach_inst_in_block_safe(vec4_instruction, inst, block) { int vf = -1; - enum brw_reg_type need_type; + enum brw_reg_type need_type = BRW_REGISTER_TYPE_LAST; /* Look for unconditional MOVs from an immediate with a partial * writemask. Skip type-conversion MOVs other than integer 0, @@ -413,7 +435,7 @@ vec4_visitor::opt_vector_float() need_type = BRW_REGISTER_TYPE_F; } } else { - last_reg = -1; + last_reg = ~0u; } /* If this wasn't a MOV, or the destination register doesn't match, @@ -441,7 +463,7 @@ vec4_visitor::opt_vector_float() } inst_count = 0; - last_reg = -1; + last_reg = ~0u;; writemask = 0; dest_type = BRW_REGISTER_TYPE_F; @@ -474,7 +496,7 @@ vec4_visitor::opt_vector_float() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -555,7 +577,7 @@ vec4_visitor::opt_reduce_swizzle() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); return progress; } @@ -583,16 +605,49 @@ vec4_visitor::split_uniform_registers() } } +/* This function returns the register number where we placed the uniform */ +static int +set_push_constant_loc(const int nr_uniforms, int *new_uniform_count, + const int src, const int size, const int channel_size, + int *new_loc, int *new_chan, + int *new_chans_used) +{ + int dst; + /* Find the lowest place we can slot this uniform in. 
*/ + for (dst = 0; dst < nr_uniforms; dst++) { + if (ALIGN(new_chans_used[dst], channel_size) + size <= 4) + break; + } + + assert(dst < nr_uniforms); + + new_loc[src] = dst; + new_chan[src] = ALIGN(new_chans_used[dst], channel_size); + new_chans_used[dst] = ALIGN(new_chans_used[dst], channel_size) + size; + + *new_uniform_count = MAX2(*new_uniform_count, dst + 1); + return dst; +} + void vec4_visitor::pack_uniform_registers() { + if (!compiler->compact_params) + return; + uint8_t chans_used[this->uniforms]; int new_loc[this->uniforms]; int new_chan[this->uniforms]; + bool is_aligned_to_dvec4[this->uniforms]; + int new_chans_used[this->uniforms]; + int channel_sizes[this->uniforms]; memset(chans_used, 0, sizeof(chans_used)); memset(new_loc, 0, sizeof(new_loc)); memset(new_chan, 0, sizeof(new_chan)); + memset(new_chans_used, 0, sizeof(new_chans_used)); + memset(is_aligned_to_dvec4, 0, sizeof(is_aligned_to_dvec4)); + memset(channel_sizes, 0, sizeof(channel_sizes)); /* Find which uniform vectors are actually used by the program. We * expect unused vector elements when we've moved array access out @@ -622,7 +677,7 @@ vec4_visitor::pack_uniform_registers() continue; assert(type_sz(inst->src[i].type) % 4 == 0); - unsigned channel_size = type_sz(inst->src[i].type) / 4; + int channel_size = type_sz(inst->src[i].type) / 4; int reg = inst->src[i].nr; for (int c = 0; c < 4; c++) { @@ -631,10 +686,15 @@ vec4_visitor::pack_uniform_registers() unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1; unsigned used = MAX2(chans_used[reg], channel * channel_size); - if (used <= 4) + if (used <= 4) { chans_used[reg] = used; - else + channel_sizes[reg] = MAX2(channel_sizes[reg], channel_size); + } else { + is_aligned_to_dvec4[reg] = true; + is_aligned_to_dvec4[reg + 1] = true; chans_used[reg + 1] = used - 4; + channel_sizes[reg + 1] = MAX2(channel_sizes[reg + 1], channel_size); + } } } @@ -652,49 +712,69 @@ vec4_visitor::pack_uniform_registers() * the next part of our packing algorithm. */ int reg = inst->src[0].nr; - for (unsigned i = 0; i < vec4s_read; i++) + int channel_size = type_sz(inst->src[0].type) / 4; + for (unsigned i = 0; i < vec4s_read; i++) { chans_used[reg + i] = 4; + channel_sizes[reg + i] = MAX2(channel_sizes[reg + i], channel_size); + } } } int new_uniform_count = 0; + /* As the uniforms are going to be reordered, take the data from a temporary + * copy of the original param[]. + */ + uint32_t *param = ralloc_array(NULL, uint32_t, stage_prog_data->nr_params); + memcpy(param, stage_prog_data->param, + sizeof(uint32_t) * stage_prog_data->nr_params); + /* Now, figure out a packing of the live uniform vectors into our - * push constants. + * push constants. Start with dvec{3,4} because they are aligned to + * dvec4 size (2 vec4). */ for (int src = 0; src < uniforms; src++) { int size = chans_used[src]; - if (size == 0) + if (size == 0 || !is_aligned_to_dvec4[src]) continue; - int dst; - /* Find the lowest place we can slot this uniform in. */ - for (dst = 0; dst < src; dst++) { - if (chans_used[dst] + size <= 4) - break; + /* dvec3 are aligned to dvec4 size, apply the alignment of the size + * to 4 to avoid moving last component of a dvec3 to the available + * location at the end of a previous dvec3. These available locations + * could be filled by smaller variables in next loop. 
+ */ + size = ALIGN(size, 4); + int dst = set_push_constant_loc(uniforms, &new_uniform_count, + src, size, channel_sizes[src], + new_loc, new_chan, + new_chans_used); + /* Move the references to the data */ + for (int j = 0; j < size; j++) { + stage_prog_data->param[dst * 4 + new_chan[src] + j] = + param[src * 4 + j]; } + } - if (src == dst) { - new_loc[src] = dst; - new_chan[src] = 0; - } else { - new_loc[src] = dst; - new_chan[src] = chans_used[dst]; + /* Continue with the rest of data, which is aligned to vec4. */ + for (int src = 0; src < uniforms; src++) { + int size = chans_used[src]; - /* Move the references to the data */ - for (int j = 0; j < size; j++) { - stage_prog_data->param[dst * 4 + new_chan[src] + j] = - stage_prog_data->param[src * 4 + j]; - } + if (size == 0 || is_aligned_to_dvec4[src]) + continue; - chans_used[dst] += size; - chans_used[src] = 0; + int dst = set_push_constant_loc(uniforms, &new_uniform_count, + src, size, channel_sizes[src], + new_loc, new_chan, + new_chans_used); + /* Move the references to the data */ + for (int j = 0; j < size; j++) { + stage_prog_data->param[dst * 4 + new_chan[src] + j] = + param[src * 4 + j]; } - - new_uniform_count = MAX2(new_uniform_count, dst + 1); } + ralloc_free(param); this->uniforms = new_uniform_count; /* Now, update the instructions for our repacked uniforms. */ @@ -705,9 +785,9 @@ vec4_visitor::pack_uniform_registers() if (inst->src[i].file != UNIFORM) continue; + int chan = new_chan[src] / channel_sizes[src]; inst->src[i].nr = new_loc[src]; - inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src], - new_chan[src], new_chan[src]); + inst->src[i].swizzle += BRW_SWIZZLE4(chan, chan, chan, chan); } } } @@ -736,10 +816,19 @@ vec4_visitor::opt_algebraic() break; if (inst->saturate) { - if (inst->dst.type != inst->src[0].type) + /* Full mixed-type saturates don't happen. However, we can end up + * with things like: + * + * mov.sat(8) g21<1>DF -1F + * + * Other mixed-size-but-same-base-type cases may also be possible. 
+ */ + if (inst->dst.type != inst->src[0].type && + inst->dst.type != BRW_REGISTER_TYPE_DF && + inst->src[0].type != BRW_REGISTER_TYPE_F) assert(!"unimplemented: saturate mixed types"); - if (brw_saturate_immediate(inst->dst.type, + if (brw_saturate_immediate(inst->src[0].type, &inst->src[0].as_brw_reg())) { inst->saturate = false; progress = true; @@ -747,6 +836,14 @@ vec4_visitor::opt_algebraic() } break; + case BRW_OPCODE_OR: + if (inst->src[1].is_zero()) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[1] = src_reg(); + progress = true; + } + break; + case VEC4_OPCODE_UNPACK_UNIFORM: if (inst->src[0].file != UNIFORM) { inst->opcode = BRW_OPCODE_MOV; @@ -791,18 +888,6 @@ vec4_visitor::opt_algebraic() progress = true; } break; - case BRW_OPCODE_CMP: - if (inst->conditional_mod == BRW_CONDITIONAL_GE && - inst->src[0].abs && - inst->src[0].negate && - inst->src[1].is_zero()) { - inst->src[0].abs = false; - inst->src[0].negate = false; - inst->conditional_mod = BRW_CONDITIONAL_Z; - progress = true; - break; - } - break; case SHADER_OPCODE_BROADCAST: if (is_uniform(inst->src[0]) || inst->src[1].is_zero()) { @@ -819,7 +904,8 @@ vec4_visitor::opt_algebraic() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | + DEPENDENCY_INSTRUCTION_DETAIL); return progress; } @@ -853,7 +939,7 @@ vec4_visitor::move_push_constants_to_pull_constants() pull_constant_loc[i / 4] = -1; if (i >= max_uniform_components) { - const gl_constant_value **values = &stage_prog_data->param[i]; + uint32_t *values = &stage_prog_data->param[i]; /* Try to find an existing copy of this uniform in the pull * constants if it was part of an array access already. @@ -932,7 +1018,7 @@ vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst) * affected, at least by the 64b restriction, since DepCtrl with double * precision instructions seems to produce GPU hangs in some cases. */ - if (devinfo->gen == 8 || devinfo->is_broxton) { + if (devinfo->gen == 8 || gen_device_info_is_9lp(devinfo)) { if (inst->opcode == BRW_OPCODE_MUL && IS_DWORD(inst->src[0]) && IS_DWORD(inst->src[1])) @@ -1071,6 +1157,12 @@ vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo, if (devinfo->gen == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW) return false; + /* If we write to the flag register changing the swizzle would change + * what channels are written to the flag register. + */ + if (writes_flag()) + return false; + /* We can't swizzle implicit accumulator access. We'd have to * reswizzle the producer of the accumulator value in addition * to the consumer (i.e. both MUL and MACH). Just skip this. @@ -1115,9 +1207,31 @@ vec4_instruction::reswizzle(int dst_writemask, int swizzle) opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 && opcode != VEC4_OPCODE_PACK_BYTES) { for (int i = 0; i < 3; i++) { - if (src[i].file == BAD_FILE || src[i].file == IMM) + if (src[i].file == BAD_FILE) continue; + if (src[i].file == IMM) { + assert(src[i].type != BRW_REGISTER_TYPE_V && + src[i].type != BRW_REGISTER_TYPE_UV); + + /* Vector immediate types need to be reswizzled. 
*/ + if (src[i].type == BRW_REGISTER_TYPE_VF) { + const unsigned imm[] = { + (src[i].ud >> 0) & 0x0ff, + (src[i].ud >> 8) & 0x0ff, + (src[i].ud >> 16) & 0x0ff, + (src[i].ud >> 24) & 0x0ff, + }; + + src[i] = brw_imm_vf4(imm[BRW_GET_SWZ(swizzle, 0)], + imm[BRW_GET_SWZ(swizzle, 1)], + imm[BRW_GET_SWZ(swizzle, 2)], + imm[BRW_GET_SWZ(swizzle, 3)]); + } + + continue; + } + src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle); } } @@ -1139,8 +1253,7 @@ vec4_visitor::opt_register_coalesce() { bool progress = false; int next_ip = 0; - - calculate_live_intervals(); + const vec4_live_variables &live = live_analysis.require(); foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) { int ip = next_ip; @@ -1182,7 +1295,7 @@ vec4_visitor::opt_register_coalesce() /* Can't coalesce this GRF if someone else was going to * read it later. */ - if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip) + if (live.var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip) continue; /* We need to check interference with the final destination between this @@ -1222,6 +1335,15 @@ vec4_visitor::opt_register_coalesce() } } + /* VS_OPCODE_UNPACK_FLAGS_SIMD4X2 generates a bunch of mov(1) + * instructions, and this optimization pass is not capable of + * handling that. Bail on these instructions and hope that some + * later optimization pass can do the right thing after they are + * expanded. + */ + if (scan_inst->opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) + break; + /* This doesn't handle saturation on the instruction we * want to coalesce away if the register types do not match. * But if scan_inst is a non type-converting 'mov', we can fix @@ -1299,8 +1421,10 @@ vec4_visitor::opt_register_coalesce() * in the register instead. */ if (to_mrf && scan_inst->mlen > 0) { - if (inst->dst.nr >= scan_inst->base_mrf && - inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) { + unsigned start = scan_inst->base_mrf; + unsigned end = scan_inst->base_mrf + scan_inst->mlen; + + if (inst->dst.nr >= start && inst->dst.nr < end) { break; } } else { @@ -1350,7 +1474,7 @@ vec4_visitor::opt_register_coalesce() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -1400,6 +1524,9 @@ vec4_visitor::eliminate_find_live_channel() } } + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); + return progress; } @@ -1474,24 +1601,25 @@ vec4_visitor::split_virtual_grfs() } } } - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); } void -vec4_visitor::dump_instruction(backend_instruction *be_inst) +vec4_visitor::dump_instruction(const backend_instruction *be_inst) const { dump_instruction(be_inst, stderr); } void -vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) +vec4_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const { - vec4_instruction *inst = (vec4_instruction *)be_inst; + const vec4_instruction *inst = (const vec4_instruction *)be_inst; if (inst->predicate) { - fprintf(file, "(%cf0.%d%s) ", + fprintf(file, "(%cf%d.%d%s) ", inst->predicate_inverse ? 
'-' : '+', - inst->flag_subreg, + inst->flag_subreg / 2, + inst->flag_subreg % 2, pred_ctrl_align16[inst->predicate]); } @@ -1503,9 +1631,10 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, "%s", conditional_modifier[inst->conditional_mod]); if (!inst->predicate && (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL && + inst->opcode != BRW_OPCODE_CSEL && inst->opcode != BRW_OPCODE_IF && inst->opcode != BRW_OPCODE_WHILE))) { - fprintf(file, ".f0.%d", inst->flag_subreg); + fprintf(file, ".f%d.%d", inst->flag_subreg / 2, inst->flag_subreg % 2); } } fprintf(file, " "); @@ -1565,7 +1694,7 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) if (inst->dst.writemask & 8) fprintf(file, "w"); } - fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type)); + fprintf(file, ":%s", brw_reg_type_to_letters(inst->dst.type)); if (inst->src[0].file != BAD_FILE) fprintf(file, ", "); @@ -1660,7 +1789,7 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, "|"); if (inst->src[i].file != IMM) { - fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type)); + fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type)); } if (i < 2 && inst->src[i + 1].file != BAD_FILE) @@ -1677,103 +1806,26 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) } -static inline struct brw_reg -attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved) -{ - struct brw_reg reg; - - unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type)); - if (interleaved) { - reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1); - } else { - reg = brw_vecn_grf(width, attr, 0); - } - - reg.type = type; - return reg; -} - - -/** - * Replace each register of type ATTR in this->instructions with a reference - * to a fixed HW register. - * - * If interleaved is true, then each attribute takes up half a register, with - * register N containing attribute 2*N in its first half and attribute 2*N+1 - * in its second half (this corresponds to the payload setup used by geometry - * shaders in "single" or "dual instanced" dispatch mode). If interleaved is - * false, then each attribute takes up a whole register, with register N - * containing attribute N (this corresponds to the payload setup used by - * vertex shaders, and by geometry shaders in "dual object" dispatch mode). - */ -void -vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map, - bool interleaved) -{ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (int i = 0; i < 3; i++) { - if (inst->src[i].file != ATTR) - continue; - - int grf = attribute_map[inst->src[i].nr + - inst->src[i].offset / REG_SIZE]; - assert(inst->src[i].offset % REG_SIZE == 0); - - /* All attributes used in the shader need to have been assigned a - * hardware register by the caller - */ - assert(grf != 0); - - struct brw_reg reg = - attribute_to_hw_reg(grf, inst->src[i].type, interleaved); - reg.swizzle = inst->src[i].swizzle; - if (inst->src[i].abs) - reg = brw_abs(reg); - if (inst->src[i].negate) - reg = negate(reg); - - inst->src[i] = reg; - } - } -} - int vec4_vs_visitor::setup_attributes(int payload_reg) { - int nr_attributes; - int attribute_map[VERT_ATTRIB_MAX + 2]; - memset(attribute_map, 0, sizeof(attribute_map)); - - nr_attributes = 0; - GLbitfield64 vs_inputs = vs_prog_data->inputs_read; - while (vs_inputs) { - GLuint first = ffsll(vs_inputs) - 1; - int needed_slots = - (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) ? 
2 : 1; - for (int c = 0; c < needed_slots; c++) { - attribute_map[first + c] = payload_reg + nr_attributes; - nr_attributes++; - vs_inputs &= ~BITFIELD64_BIT(first + c); + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == ATTR) { + assert(inst->src[i].offset % REG_SIZE == 0); + int grf = payload_reg + inst->src[i].nr + + inst->src[i].offset / REG_SIZE; + + struct brw_reg reg = brw_vec8_grf(grf, 0); + reg.swizzle = inst->src[i].swizzle; + reg.type = inst->src[i].type; + reg.abs = inst->src[i].abs; + reg.negate = inst->src[i].negate; + inst->src[i] = reg; + } } } - /* VertexID is stored by the VF as the last vertex element, but we - * don't represent it with a flag in inputs_read, so we call it - * VERT_ATTRIB_MAX. - */ - if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid || - vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) { - attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes; - nr_attributes++; - } - - if (vs_prog_data->uses_drawid) { - attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes; - nr_attributes++; - } - - lower_attributes_to_hw_regs(attribute_map, false /* interleaved */); - return payload_reg + vs_prog_data->nr_attribute_slots; } @@ -1786,12 +1838,10 @@ vec4_visitor::setup_uniforms(int reg) * matter what, or the GPU would hang. */ if (devinfo->gen < 6 && this->uniforms == 0) { - stage_prog_data->param = - reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4); + brw_stage_prog_data_add_params(stage_prog_data, 4); for (unsigned int i = 0; i < 4; i++) { unsigned int slot = this->uniforms * 4 + i; - static gl_constant_value zero = { 0.0 }; - stage_prog_data->param[slot] = &zero; + stage_prog_data->param[slot] = BRW_PARAM_BUILTIN_ZERO; } this->uniforms++; @@ -1800,6 +1850,9 @@ vec4_visitor::setup_uniforms(int reg) reg += ALIGN(uniforms, 2) / 2; } + for (int i = 0; i < 4; i++) + reg += stage_prog_data->ubo_ranges[i].length; + stage_prog_data->nr_params = this->uniforms * 4; prog_data->base.curb_read_length = @@ -1851,7 +1904,7 @@ vec4_visitor::lower_minmax() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -1966,18 +2019,41 @@ is_align1_df(vec4_instruction *inst) } } +/** + * Three source instruction must have a GRF/MRF destination register. + * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. 
+ */ +void +vec4_visitor::fixup_3src_null_dest() +{ + bool progress = false; + + foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) { + if (inst->is_3src(devinfo) && inst->dst.is_null()) { + const unsigned size_written = type_sz(inst->dst.type); + const unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE); + + inst->dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), + inst->dst.type); + progress = true; + } + } + + if (progress) + invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_VARIABLES); +} + void vec4_visitor::convert_to_hw_regs() { foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (int i = 0; i < 3; i++) { - struct src_reg &src = inst->src[i]; + class src_reg &src = inst->src[i]; struct brw_reg reg; switch (src.file) { case VGRF: { - const unsigned type_size = type_sz(src.type); - const unsigned width = REG_SIZE / 2 / MAX2(4, type_size); - reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset); + reg = byte_offset(brw_vecn_grf(4, src.nr, 0), src.offset); reg.type = src.type; reg.abs = src.abs; reg.negate = src.negate; @@ -1985,12 +2061,11 @@ vec4_visitor::convert_to_hw_regs() } case UNIFORM: { - const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type)); reg = stride(byte_offset(brw_vec4_grf( prog_data->base.dispatch_grf_start_reg + src.nr / 2, src.nr % 2 * 4), src.offset), - 0, width, 1); + 0, 4, 1); reg.type = src.type; reg.abs = src.abs; reg.negate = src.negate; @@ -2264,7 +2339,12 @@ vec4_visitor::lower_simd_width() if (linst->src[i].file == BAD_FILE) continue; - if (!is_uniform(linst->src[i])) + bool is_interleaved_attr = + linst->src[i].file == ATTR && + stage_uses_interleaved_attributes(stage, + prog_data->dispatch_mode); + + if (!is_uniform(linst->src[i]) && !is_interleaved_attr) linst->src[i] = horiz_offset(linst->src[i], channel_offset); } @@ -2289,7 +2369,7 @@ vec4_visitor::lower_simd_width() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -2446,7 +2526,7 @@ vec4_visitor::scalarize_df() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS); return progress; } @@ -2489,7 +2569,7 @@ vec4_visitor::lower_64bit_mad_to_mul_add() } if (progress) - invalidate_live_intervals(); + invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); return progress; } @@ -2527,6 +2607,11 @@ vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg, assert(brw_is_single_value_swizzle(reg.swizzle) || is_supported_64bit_region(inst, arg)); + /* Apply the region <2, 2, 1> for GRF or <0, 2, 1> for uniforms, as align16 + * HW can only do 32-bit swizzle channels. 
+ */ + hw_reg->width = BRW_WIDTH_2; + if (is_supported_64bit_region(inst, arg) && !is_gen7_supported_64bit_swizzle(inst, arg)) { /* Supported 64-bit swizzles are those such that their first two @@ -2581,6 +2666,13 @@ vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg, } } +void +vec4_visitor::invalidate_analysis(brw::analysis_dependency_class c) +{ + backend_shader::invalidate_analysis(c); + live_analysis.invalidate(c); +} + bool vec4_visitor::run() { @@ -2618,7 +2710,7 @@ vec4_visitor::run() if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \ char filename[64]; \ snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \ - stage_abbrev, nir->info->name, iteration, pass_num); \ + stage_abbrev, nir->info.name, iteration, pass_num); \ \ backend_shader::dump_instructions(filename); \ } \ @@ -2631,7 +2723,7 @@ vec4_visitor::run() if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { char filename[64]; snprintf(filename, 64, "%s-%s-00-00-start", - stage_abbrev, nir->info->name); + stage_abbrev, nir->info.name); backend_shader::dump_instructions(filename); } @@ -2710,6 +2802,8 @@ vec4_visitor::run() OPT(scalarize_df); } + fixup_3src_null_dest(); + bool allocated_without_spills = reg_allocate(); if (!allocated_without_spills) { @@ -2759,50 +2853,87 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, const struct brw_vs_prog_key *key, struct brw_vs_prog_data *prog_data, - const nir_shader *src_shader, - gl_clip_plane *clip_planes, - bool use_legacy_snorm_formula, + nir_shader *nir, int shader_time_index, - unsigned *final_assembly_size, + struct brw_compile_stats *stats, char **error_str) { const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; - nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); - shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar); - brw_nir_lower_vs_inputs(shader, is_scalar, - use_legacy_snorm_formula, key->gl_attrib_wa_flags); - brw_nir_lower_vue_outputs(shader, is_scalar); - shader = brw_postprocess_nir(shader, compiler, is_scalar); + brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar); const unsigned *assembly = NULL; + if (prog_data->base.vue_map.varying_to_slot[VARYING_SLOT_EDGE] != -1) { + /* If the output VUE map contains VARYING_SLOT_EDGE then we need to copy + * the edge flag from VERT_ATTRIB_EDGEFLAG. This will be done + * automatically by brw_vec4_visitor::emit_urb_slot but we need to + * ensure that prog_data->inputs_read is accurate. + * + * In order to make late NIR passes aware of the change, we actually + * whack shader->info.inputs_read instead. This is safe because we just + * made a copy of the shader. 
+ */ + assert(!is_scalar); + assert(key->copy_edgeflag); + nir->info.inputs_read |= VERT_BIT_EDGEFLAG; + } + + prog_data->inputs_read = nir->info.inputs_read; + prog_data->double_inputs_read = nir->info.vs.double_inputs; + + brw_nir_lower_vs_inputs(nir, key->gl_attrib_wa_flags); + brw_nir_lower_vue_outputs(nir); + brw_postprocess_nir(nir, compiler, is_scalar); + prog_data->base.clip_distance_mask = - ((1 << shader->info->clip_distance_array_size) - 1); + ((1 << nir->info.clip_distance_array_size) - 1); prog_data->base.cull_distance_mask = - ((1 << shader->info->cull_distance_array_size) - 1) << - shader->info->clip_distance_array_size; + ((1 << nir->info.cull_distance_array_size) - 1) << + nir->info.clip_distance_array_size; - unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read); + unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); /* gl_VertexID and gl_InstanceID are system values, but arrive via an * incoming vertex attribute. So, add an extra slot. */ - if (shader->info->system_values_read & - (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) | + if (nir->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) | BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) { nr_attribute_slots++; } - /* gl_DrawID has its very own vec4 */ - if (shader->info->system_values_read & - BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) { + /* gl_DrawID and IsIndexedDraw share its very own vec4 */ + if (nir->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID) | + BITFIELD64_BIT(SYSTEM_VALUE_IS_INDEXED_DRAW))) { nr_attribute_slots++; } - unsigned nr_attributes = nr_attribute_slots - - DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2); + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_IS_INDEXED_DRAW)) + prog_data->uses_is_indexed_draw = true; + + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX)) + prog_data->uses_firstvertex = true; + + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE)) + prog_data->uses_baseinstance = true; + + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE)) + prog_data->uses_vertexid = true; + + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID)) + prog_data->uses_instanceid = true; + + if (nir->info.system_values_read & + BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) + prog_data->uses_drawid = true; /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. 
Empirically, in @@ -2815,7 +2946,6 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, prog_data->base.urb_read_length = DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2); - prog_data->nr_attributes = nr_attributes; prog_data->nr_attribute_slots = nr_attribute_slots; /* Since vertex shaders reuse the same VUE entry for inputs and outputs @@ -2825,10 +2955,17 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, const unsigned vue_entries = MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots); - if (compiler->devinfo->gen == 6) + if (compiler->devinfo->gen == 6) { prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8); - else + } else { prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + /* On Cannonlake software shall not program an allocation size that + * specifies a size that is a multiple of 3 64B (512-bit) cachelines. + */ + if (compiler->devinfo->gen == 10 && + prog_data->base.urb_entry_size % 3 == 0) + prog_data->base.urb_entry_size++; + } if (INTEL_DEBUG & DEBUG_VS) { fprintf(stderr, "VS Output "); @@ -2838,10 +2975,10 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, if (is_scalar) { prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; - fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base, - NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */ - shader, 8, shader_time_index); - if (!v.run_vs(clip_planes)) { + fs_visitor v(compiler, log_data, mem_ctx, &key->base, + &prog_data->base.base, + nir, 8, shader_time_index); + if (!v.run_vs()) { if (error_str) *error_str = ralloc_strdup(mem_ctx, v.fail_msg); @@ -2850,28 +2987,29 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs; - fs_generator g(compiler, log_data, mem_ctx, (void *) key, - &prog_data->base.base, v.promoted_constants, - v.runtime_check_aads_emit, MESA_SHADER_VERTEX); + fs_generator g(compiler, log_data, mem_ctx, + &prog_data->base.base, v.runtime_check_aads_emit, + MESA_SHADER_VERTEX); if (INTEL_DEBUG & DEBUG_VS) { const char *debug_name = ralloc_asprintf(mem_ctx, "%s vertex shader %s", - shader->info->label ? shader->info->label : + nir->info.label ? nir->info.label : "unnamed", - shader->info->name); + nir->info.name); g.enable_debug(debug_name); } - g.generate_code(v.cfg, 8); - assembly = g.get_assembly(final_assembly_size); + g.generate_code(v.cfg, 8, v.shader_stats, + v.performance_analysis.require(), stats); + g.add_const_data(nir->constant_data, nir->constant_data_size); + assembly = g.get_assembly(); } if (!assembly) { prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; vec4_vs_visitor v(compiler, log_data, key, prog_data, - shader, clip_planes, mem_ctx, - shader_time_index, use_legacy_snorm_formula); + nir, mem_ctx, shader_time_index); if (!v.run()) { if (error_str) *error_str = ralloc_strdup(mem_ctx, v.fail_msg); @@ -2880,8 +3018,10 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, } assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, - shader, &prog_data->base, v.cfg, - final_assembly_size); + nir, &prog_data->base, + v.cfg, + v.performance_analysis.require(), + stats); } return assembly;
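
A note on the uniform repacking introduced above: the rule that the new set_push_constant_loc() helper implements is "scan the destination vec4 slots and take the first one whose channel usage, rounded up to this uniform's channel size, still leaves room", with dvec3/dvec4 uniforms slotted first and their size rounded up to a full vec4 so smaller uniforms cannot later land in the tail of a dvec4-aligned pair. The sketch below is a minimal, self-contained model of that slotting rule only, not the Mesa code itself; find_slot(), align_up(), channels_used[] and the sample uniform list are illustrative names invented for this example.

#include <cassert>
#include <cstdio>

/* Round x up to a multiple of a (a is 1 or 2 here). */
static int align_up(int x, int a)
{
   return (x + a - 1) / a * a;
}

/* First vec4 slot where `size` channels of `channel_size`-dword components
 * still fit once the slot's current usage is aligned to channel_size;
 * this mirrors the search loop in set_push_constant_loc(). */
static int find_slot(const int *channels_used, int nr_slots,
                     int size, int channel_size)
{
   for (int dst = 0; dst < nr_slots; dst++) {
      if (align_up(channels_used[dst], channel_size) + size <= 4)
         return dst;
   }
   return -1;
}

int main()
{
   int channels_used[3] = { 0, 0, 0 };

   /* Hypothetical inputs: a float vec3, a float scalar and a double scalar
    * (double components occupy two 32-bit channels each). */
   const struct { int size, channel_size; } uniforms[] = {
      { 3, 1 }, { 1, 1 }, { 2, 2 },
   };

   for (const auto &u : uniforms) {
      int dst = find_slot(channels_used, 3, u.size, u.channel_size);
      assert(dst >= 0);
      int chan = align_up(channels_used[dst], u.channel_size);
      channels_used[dst] = chan + u.size;
      printf("size %d -> slot %d, channel %d\n", u.size, dst, chan);
   }
   /* Prints: slot 0 chan 0, slot 0 chan 3, slot 1 chan 0 -- the scalar float
    * fills the vec3's tail, while the double starts a fresh slot because its
    * 2-channel alignment no longer fits after three used channels. */
   return 0;
}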
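
Similarly, the VF-immediate case added to vec4_instruction::reswizzle() amounts to permuting the four packed 8-bit restricted floats rather than composing swizzle fields. The stand-alone sketch below models that byte permutation under the 2-bits-per-channel swizzle layout used by BRW_GET_SWZ(); get_chan() and reswizzle_vf() are illustrative names, not Mesa functions.

#include <cstdint>
#include <cstdio>

/* Channel c (0..3) of a swizzle packed as 2 bits per channel, x first. */
static unsigned get_chan(unsigned swizzle, unsigned c)
{
   return (swizzle >> (2 * c)) & 0x3;
}

/* A VF immediate packs four 8-bit restricted floats into one 32-bit word,
 * so applying a swizzle to it means re-ordering those bytes. */
static uint32_t reswizzle_vf(uint32_t vf, unsigned swizzle)
{
   const uint8_t byte[4] = {
      (uint8_t)(vf >>  0), (uint8_t)(vf >>  8),
      (uint8_t)(vf >> 16), (uint8_t)(vf >> 24),
   };
   return (uint32_t)byte[get_chan(swizzle, 0)] <<  0 |
          (uint32_t)byte[get_chan(swizzle, 1)] <<  8 |
          (uint32_t)byte[get_chan(swizzle, 2)] << 16 |
          (uint32_t)byte[get_chan(swizzle, 3)] << 24;
}

int main()
{
   /* 0x1b encodes .wzyx in this layout, so the byte order is reversed. */
   printf("%08x\n", reswizzle_vf(0x44332211u, 0x1b));   /* prints 11223344 */
   return 0;
}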