X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs.cpp;h=c858f449c8f0642548a964dd26099e222640428c;hb=0e657b7b55bc7c83c8eb5258cd9522b0e5e581b7;hp=0244f593149bac3d7e7745a107f06e47a7277fee;hpb=69570bbad876bb9da609c3b651aacda28cecc542;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 0244f593149..c858f449c8f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -172,12 +172,12 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, * be any component of a vector, and then we load 4 contiguous * components starting from that. * - * We break down the const_offset to a portion added to the variable - * offset and a portion done using reg_offset, which means that if you - * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = - * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and - * CSE can later notice that those loads are all the same and eliminate - * the redundant ones. + * We break down the const_offset to a portion added to the variable offset + * and a portion done using fs_reg::offset, which means that if you have + * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]", + * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can + * later notice that those loads are all the same and eliminate the + * redundant ones. */ fs_reg vec4_offset = vgrf(glsl_type::uint_type); bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); @@ -191,7 +191,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, vec4_result, surf_index, vec4_offset); - inst->size_written = 4 * bld.dispatch_width() / 8 * REG_SIZE; + inst->size_written = 4 * vec4_result.component_size(inst->exec_size); if (type_sz(dst.type) == 8) { shuffle_32bit_load_result_to_64bit_data( @@ -240,12 +240,6 @@ fs_inst::equals(fs_inst *inst) const offset == inst->offset); } -bool -fs_inst::overwrites_reg(const fs_reg ®) const -{ - return reg.in_range(dst, DIV_ROUND_UP(size_written, REG_SIZE)); -} - bool fs_inst::is_send_from_grf() const { @@ -353,7 +347,7 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const return false; fs_reg reg = this->src[0]; - if (reg.file != VGRF || reg.offset / REG_SIZE != 0 || reg.stride == 0) + if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1) return false; if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written) @@ -441,15 +435,6 @@ fs_reg::equals(const fs_reg &r) const stride == r.stride); } -fs_reg & -fs_reg::set_smear(unsigned subreg) -{ - assert(file != ARF && file != FIXED_GRF && file != IMM); - offset = ROUND_DOWN_TO(offset, REG_SIZE) + subreg * type_sz(type); - stride = 0; - return *this; -} - bool fs_reg::is_contiguous() const { @@ -562,15 +547,14 @@ fs_visitor::get_timestamp(const fs_builder &bld) void fs_visitor::emit_shader_time_begin() { - shader_start_time = get_timestamp(bld.annotate("shader time start")); - /* We want only the low 32 bits of the timestamp. Since it's running * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, * which is plenty of time for our purposes. It is identical across the * EUs, but since it's tracking GPU core speed it will increment at a * varying rate as render P-states change. */ - shader_start_time.set_smear(0); + shader_start_time = component( + get_timestamp(bld.annotate("shader time start")), 0); } void @@ -581,8 +565,7 @@ fs_visitor::emit_shader_time_end() assert(end && ((fs_inst *) end)->eot); const fs_builder ibld = bld.annotate("shader time end") .exec_all().at(NULL, end); - - fs_reg shader_end_time = get_timestamp(ibld); + const fs_reg timestamp = get_timestamp(ibld); /* We only use the low 32 bits of the timestamp - see * emit_shader_time_begin()). @@ -591,22 +574,21 @@ fs_visitor::emit_shader_time_end() * else that might disrupt timing) by setting smear to 2 and checking if * that field is != 0. */ - shader_end_time.set_smear(0); + const fs_reg shader_end_time = component(timestamp, 0); /* Check that there weren't any timestamp reset events (assuming these * were the only two timestamp reads that happened). */ - fs_reg reset = shader_end_time; - reset.set_smear(2); + const fs_reg reset = component(timestamp, 2); set_condmod(BRW_CONDITIONAL_Z, ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u))); ibld.IF(BRW_PREDICATE_NORMAL); fs_reg start = shader_start_time; start.negate = true; - fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - diff.set_smear(0); - + const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1), + BRW_REGISTER_TYPE_UD), + 0); const fs_builder cbld = ibld.group(1, 0); cbld.group(1, 0).ADD(diff, start, shader_end_time); @@ -817,8 +799,8 @@ fs_inst::components_read(unsigned i) const } } -int -fs_inst::regs_read(int arg) const +unsigned +fs_inst::size_read(int arg) const { switch (opcode) { case FS_OPCODE_FB_WRITE: @@ -837,78 +819,52 @@ fs_inst::regs_read(int arg) const case SHADER_OPCODE_TYPED_SURFACE_WRITE: case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: if (arg == 0) - return mlen; + return mlen * REG_SIZE; break; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: /* The payload is actually stored in src1 */ if (arg == 1) - return mlen; + return mlen * REG_SIZE; break; case FS_OPCODE_LINTERP: if (arg == 1) - return 1; + return 16; break; case SHADER_OPCODE_LOAD_PAYLOAD: if (arg < this->header_size) - return 1; + return REG_SIZE; break; case CS_OPCODE_CS_TERMINATE: case SHADER_OPCODE_BARRIER: - return 1; + return REG_SIZE; case SHADER_OPCODE_MOV_INDIRECT: if (arg == 0) { assert(src[2].file == IMM); - unsigned region_length = src[2].ud; - - if (src[0].file == UNIFORM) { - assert(region_length % 4 == 0); - return region_length / 4; - } else if (src[0].file == FIXED_GRF) { - /* If the start of the region is not register aligned, then - * there's some portion of the register that's technically - * unread at the beginning. - * - * However, the register allocator works in terms of whole - * registers, and does not use subnr. It assumes that the - * read starts at the beginning of the register, and extends - * regs_read() whole registers beyond that. - * - * To compensate, we extend the region length to include this - * unread portion at the beginning. - */ - if (src[0].subnr) - region_length += src[0].subnr; - - return DIV_ROUND_UP(region_length, REG_SIZE); - } else { - assert(!"Invalid register file"); - } + return src[2].ud; } break; default: if (is_tex() && arg == 0 && src[0].file == VGRF) - return mlen; + return mlen * REG_SIZE; break; } switch (src[arg].file) { case UNIFORM: case IMM: - return 1; + return components_read(arg) * type_sz(src[arg].type); case BAD_FILE: case ARF: case FIXED_GRF: case VGRF: case ATTR: - return DIV_ROUND_UP(components_read(arg) * - src[arg].component_size(exec_size), - REG_SIZE); + return components_read(arg) * src[arg].component_size(exec_size); case MRF: unreachable("MRF registers are not allowed as sources"); } @@ -1289,9 +1245,9 @@ fs_visitor::emit_sampleid_setup() brw_imm_v(0x44440000)); abld.AND(*reg, tmp, brw_imm_w(0xf)); } else { - fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D); - t1.set_smear(0); - fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); + const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1), + BRW_REGISTER_TYPE_D), 0); + const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with * 8x multisampling, subspan 0 will represent sample N (where N @@ -2106,7 +2062,7 @@ fs_visitor::assign_constant_locations() stage_prog_data->nr_params = num_push_constants; stage_prog_data->nr_pull_params = num_pull_constants; - /* Up until now, the param[] array has been indexed by reg + reg_offset + /* Up until now, the param[] array has been indexed by reg + offset * of UNIFORM registers. Move pull constants into pull_param[] and * condense param[] to only contain the uniforms we chose to push. * @@ -2181,9 +2137,7 @@ fs_visitor::lower_constant_loads() /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; inst->src[i].nr = dst.nr; - inst->src[i].offset %= 4; - inst->src[i].set_smear((pull_index & 3) * 4 / - type_sz(inst->src[i].type)); + inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4; brw_mark_surface_used(prog_data, index); } @@ -2547,7 +2501,7 @@ fs_visitor::opt_sampler_eot() for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) { if (i == FB_WRITE_LOGICAL_SRC_COLOR0) { if (!fb_write->src[i].equals(tex_inst->dst) || - fb_write->regs_read(i) * REG_SIZE != tex_inst->size_written) + fb_write->size_read(i) != tex_inst->size_written) return false; } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) { if (fb_write->src[i].file != BAD_FILE) @@ -2678,16 +2632,18 @@ fs_visitor::opt_redundant_discard_jumps() /** * Compute a bitmask with GRF granularity with a bit set for each GRF starting - * from \p r which overlaps the region starting at \p r and spanning \p n GRF - * units. + * from \p r.offset which overlaps the region starting at \p s.offset and + * spanning \p ds bytes. */ static inline unsigned -mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned n) +mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) { - const int rel_offset = (reg_offset(s) - reg_offset(r)) / REG_SIZE; + const int rel_offset = reg_offset(s) - reg_offset(r); + const int shift = rel_offset / REG_SIZE; + const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE); assert(reg_space(r) == reg_space(s) && - rel_offset >= 0 && rel_offset < int(8 * sizeof(unsigned))); - return ((1 << n) - 1) << rel_offset; + shift >= 0 && shift < int(8 * sizeof(unsigned))); + return ((1 << n) - 1) << shift; } bool @@ -2730,7 +2686,7 @@ fs_visitor::compute_to_mrf() foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { if (regions_overlap(scan_inst->dst, scan_inst->size_written, - inst->src[0], inst->regs_read(0) * REG_SIZE)) { + inst->src[0], inst->size_read(0))) { /* Found the last thing to write our reg we want to turn * into a compute-to-MRF. */ @@ -2747,9 +2703,8 @@ fs_visitor::compute_to_mrf() * would need us to understand coalescing out more than one MOV at * a time. */ - if (scan_inst->dst.offset / REG_SIZE < inst->src[0].offset / REG_SIZE || - scan_inst->dst.offset / REG_SIZE + DIV_ROUND_UP(scan_inst->size_written, REG_SIZE) > - inst->src[0].offset / REG_SIZE + inst->regs_read(0)) + if (!region_contained_in(scan_inst->dst, scan_inst->size_written, + inst->src[0], inst->size_read(0))) break; /* SEND instructions can't have MRF as a destination. */ @@ -2767,8 +2722,7 @@ fs_visitor::compute_to_mrf() /* Clear the bits for any registers this instruction overwrites. */ regs_left &= ~mask_relative_to( - inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written, - REG_SIZE)); + inst->src[0], scan_inst->dst, scan_inst->size_written); if (!regs_left) break; } @@ -2785,8 +2739,8 @@ fs_visitor::compute_to_mrf() */ bool interfered = false; for (int i = 0; i < scan_inst->sources; i++) { - if (regions_overlap(scan_inst->src[i], scan_inst->regs_read(i) * REG_SIZE, - inst->src[0], inst->regs_read(0) * REG_SIZE)) { + if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i), + inst->src[0], inst->size_read(0))) { interfered = true; } } @@ -2823,21 +2777,20 @@ fs_visitor::compute_to_mrf() foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { if (regions_overlap(scan_inst->dst, scan_inst->size_written, - inst->src[0], inst->regs_read(0) * REG_SIZE)) { + inst->src[0], inst->size_read(0))) { /* Clear the bits for any registers this instruction overwrites. */ regs_left &= ~mask_relative_to( - inst->src[0], scan_inst->dst, DIV_ROUND_UP(scan_inst->size_written, - REG_SIZE)); + inst->src[0], scan_inst->dst, scan_inst->size_written); - const unsigned rel_offset = (reg_offset(scan_inst->dst) - - reg_offset(inst->src[0])) / REG_SIZE; + const unsigned rel_offset = reg_offset(scan_inst->dst) - + reg_offset(inst->src[0]); if (inst->dst.nr & BRW_MRF_COMPR4) { /* Apply the same address transformation done by the hardware * for COMPR4 MRF writes. */ - assert(rel_offset < 2); - scan_inst->dst.nr = inst->dst.nr + rel_offset * 4; + assert(rel_offset < 2 * REG_SIZE); + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4; /* Clear the COMPR4 bit if the generating instruction is not * compressed. @@ -2849,11 +2802,11 @@ fs_visitor::compute_to_mrf() /* Calculate the MRF number the result of this instruction is * ultimately written to. */ - scan_inst->dst.nr = inst->dst.nr + rel_offset; + scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE; } scan_inst->dst.file = MRF; - scan_inst->dst.offset %= REG_SIZE; + scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE; scan_inst->saturate |= inst->saturate; if (!regs_left) break; @@ -3027,7 +2980,7 @@ fs_visitor::remove_duplicate_mrf_writes() if (last_mrf_move[i] && regions_overlap(inst->dst, inst->size_written, last_mrf_move[i]->src[0], - last_mrf_move[i]->regs_read(0) * REG_SIZE)) { + last_mrf_move[i]->size_read(0))) { last_mrf_move[i] = NULL; } } @@ -3213,10 +3166,6 @@ fs_visitor::insert_gen4_send_dependency_workarounds() bool progress = false; - /* Note that we're done with register allocation, so GRF fs_regs always - * have a .reg_offset of 0. - */ - foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->mlen != 0 && inst->dst.file == VGRF) { insert_gen4_pre_send_dependency_workarounds(block, inst); @@ -3500,62 +3449,27 @@ fs_visitor::lower_integer_multiplication() inst->dst.type); if (devinfo->gen >= 7) { - fs_reg src1_0_w = inst->src[1]; - fs_reg src1_1_w = inst->src[1]; - if (inst->src[1].file == IMM) { - src1_0_w.ud &= 0xffff; - src1_1_w.ud >>= 16; + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); } else { - src1_0_w.type = BRW_REGISTER_TYPE_UW; - if (src1_0_w.stride != 0) { - assert(src1_0_w.stride == 1); - src1_0_w.stride = 2; - } - - src1_1_w.type = BRW_REGISTER_TYPE_UW; - if (src1_1_w.stride != 0) { - assert(src1_1_w.stride == 1); - src1_1_w.stride = 2; - } - src1_1_w.offset += type_sz(BRW_REGISTER_TYPE_UW); + ibld.MUL(low, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + ibld.MUL(high, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); } - ibld.MUL(low, inst->src[0], src1_0_w); - ibld.MUL(high, inst->src[0], src1_1_w); } else { - fs_reg src0_0_w = inst->src[0]; - fs_reg src0_1_w = inst->src[0]; - - src0_0_w.type = BRW_REGISTER_TYPE_UW; - if (src0_0_w.stride != 0) { - assert(src0_0_w.stride == 1); - src0_0_w.stride = 2; - } - - src0_1_w.type = BRW_REGISTER_TYPE_UW; - if (src0_1_w.stride != 0) { - assert(src0_1_w.stride == 1); - src0_1_w.stride = 2; - } - src0_1_w.offset += type_sz(BRW_REGISTER_TYPE_UW); - - ibld.MUL(low, src0_0_w, inst->src[1]); - ibld.MUL(high, src0_1_w, inst->src[1]); + ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), + inst->src[1]); + ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), + inst->src[1]); } - fs_reg dst = inst->dst; - dst.type = BRW_REGISTER_TYPE_UW; - dst.offset = ROUND_DOWN_TO(dst.offset, REG_SIZE) + 2; - dst.stride = 2; - - high.type = BRW_REGISTER_TYPE_UW; - high.stride = 2; - - low.type = BRW_REGISTER_TYPE_UW; - low.offset = ROUND_DOWN_TO(low.offset, REG_SIZE) + 2; - low.stride = 2; - - ibld.ADD(dst, low, high); + ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); if (inst->conditional_mod || orig_dst.file == MRF) { set_condmod(inst->conditional_mod, @@ -4607,7 +4521,7 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); for (unsigned i = 0; i < inst->sources; i++) - reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i)); + reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); /* Calculate the maximum execution size of the instruction based on the * factor by which it goes over the hardware limit of 2 GRFs. @@ -4631,8 +4545,8 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo, */ if (devinfo->gen < 8) { for (unsigned i = 0; i < inst->sources; i++) { - if (DIV_ROUND_UP(inst->size_written, REG_SIZE) == 2 && - inst->regs_read(i) != 0 && inst->regs_read(i) != 2 && + if (inst->size_written > REG_SIZE && + inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE && !is_uniform(inst->src[i]) && !(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1)) { @@ -5114,7 +5028,7 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) * the data read from the same source by other lowered instructions. */ if (regions_overlap(inst->dst, inst->size_written, - inst->src[i], inst->regs_read(i) * REG_SIZE) && + inst->src[i], inst->size_read(i)) && !inst->dst.equals(inst->src[i])) return true; } @@ -5315,10 +5229,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) switch (inst->dst.file) { case VGRF: fprintf(file, "vgrf%d", inst->dst.nr); - if (alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written || - inst->dst.offset % REG_SIZE) - fprintf(file, "+%d.%d", - inst->dst.offset / REG_SIZE, inst->dst.offset % REG_SIZE); break; case FIXED_GRF: fprintf(file, "g%d", inst->dst.nr); @@ -5330,10 +5240,10 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, "(null)"); break; case UNIFORM: - fprintf(file, "***u%d***", inst->dst.nr + inst->dst.offset / 4); + fprintf(file, "***u%d***", inst->dst.nr); break; case ATTR: - fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.offset / REG_SIZE); + fprintf(file, "***attr%d***", inst->dst.nr); break; case ARF: switch (inst->dst.nr) { @@ -5353,12 +5263,19 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); break; } - if (inst->dst.subnr) - fprintf(file, "+%d", inst->dst.subnr); break; case IMM: unreachable("not reached"); } + + if (inst->dst.offset || + (inst->dst.file == VGRF && + alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) { + const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE); + fprintf(file, "+%d.%d", inst->dst.offset / reg_size, + inst->dst.offset % reg_size); + } + if (inst->dst.stride != 1) fprintf(file, "<%u>", inst->dst.stride); fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type)); @@ -5371,10 +5288,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) switch (inst->src[i].file) { case VGRF: fprintf(file, "vgrf%d", inst->src[i].nr); - if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) || - inst->src[i].offset % REG_SIZE != 0) - fprintf(file, "+%d.%d", inst->src[i].offset / REG_SIZE, - inst->src[i].offset % REG_SIZE); break; case FIXED_GRF: fprintf(file, "g%d", inst->src[i].nr); @@ -5383,14 +5296,10 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, "***m%d***", inst->src[i].nr); break; case ATTR: - fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].offset / REG_SIZE); + fprintf(file, "attr%d", inst->src[i].nr); break; case UNIFORM: - fprintf(file, "u%d", inst->src[i].nr + inst->src[i].offset / 4); - if (inst->src[i].offset % 4 != 0) { - fprintf(file, "+%d.%d", inst->src[i].offset / 4, - inst->src[i].offset % 4); - } + fprintf(file, "u%d", inst->src[i].nr); break; case BAD_FILE: fprintf(file, "(null)"); @@ -5441,10 +5350,17 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); break; } - if (inst->src[i].subnr) - fprintf(file, "+%d", inst->src[i].subnr); break; } + + if (inst->src[i].offset || + (inst->src[i].file == VGRF && + alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) { + const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE); + fprintf(file, "+%d.%d", inst->src[i].offset / reg_size, + inst->src[i].offset % reg_size); + } + if (inst->src[i].abs) fprintf(file, "|");