X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_fs.cpp;h=9d0546e57976017a8e2d1d684420ab2570d229cf;hb=c0ef14f5b1a59d016369a0d3322b9e783009b308;hp=814da8ad94da64b9d71c21951afe2e10262ac965;hpb=3dcbc5cdaa871f6ad2f123d0eb81f32fbfa7070b;p=mesa.git diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 814da8ad94d..9d0546e5797 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -191,14 +191,21 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, vec4_result, surf_index, vec4_offset); inst->size_written = 4 * vec4_result.component_size(inst->exec_size); - if (type_sz(dst.type) == 8) { - shuffle_32bit_load_result_to_64bit_data( - bld, retype(vec4_result, dst.type), vec4_result, 2); + fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4); + switch (type_sz(dst.type)) { + case 2: + shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 1); + bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1)); + break; + case 4: + bld.MOV(dst, retype(dw, dst.type)); + break; + case 8: + shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1); + break; + default: + unreachable("Unsupported bit_size"); } - - vec4_result.type = dst.type; - bld.MOV(dst, offset(vec4_result, bld, - (const_offset & 0xf) / type_sz(vec4_result.type))); } /** @@ -250,6 +257,8 @@ fs_inst::is_send_from_grf() const case SHADER_OPCODE_UNTYPED_ATOMIC: case SHADER_OPCODE_UNTYPED_SURFACE_READ: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE: + case SHADER_OPCODE_BYTE_SCATTERED_READ: case SHADER_OPCODE_TYPED_ATOMIC: case SHADER_OPCODE_TYPED_SURFACE_READ: case SHADER_OPCODE_TYPED_SURFACE_WRITE: @@ -454,6 +463,10 @@ type_size_scalar(const struct glsl_type *type) case GLSL_TYPE_FLOAT: case GLSL_TYPE_BOOL: return type->components(); + case GLSL_TYPE_UINT16: + case GLSL_TYPE_INT16: + case GLSL_TYPE_FLOAT16: + return DIV_ROUND_UP(type->components(), 2); case GLSL_TYPE_DOUBLE: case GLSL_TYPE_UINT64: case GLSL_TYPE_INT64: @@ -745,6 +758,23 @@ fs_inst::components_read(unsigned i) const else return 1; + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + /* Scattered logical opcodes use the following params: + * src[0] Surface coordinates + * src[1] Surface operation source (ignored for reads) + * src[2] Surface + * src[3] IMM with always 1 dimension. + * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32 + */ + assert(src[3].file == IMM && + src[4].file == IMM); + return i == 1 ? 0 : 1; + + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + assert(src[3].file == IMM && + src[4].file == IMM); + return 1; + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { assert(src[3].file == IMM && @@ -787,6 +817,8 @@ fs_inst::size_read(int arg) const case SHADER_OPCODE_TYPED_SURFACE_READ: case SHADER_OPCODE_TYPED_SURFACE_WRITE: case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE: + case SHADER_OPCODE_BYTE_SCATTERED_READ: if (arg == 0) return mlen * REG_SIZE; break; @@ -913,7 +945,7 @@ fs_inst::flags_written() const * instruction -- the FS opcodes often generate MOVs in addition. 
*/ int -fs_visitor::implied_mrf_writes(fs_inst *inst) +fs_visitor::implied_mrf_writes(fs_inst *inst) const { if (inst->mlen == 0) return 0; @@ -996,6 +1028,7 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; + this->subgroup_id = v->subgroup_id; } void @@ -1873,76 +1906,123 @@ fs_visitor::compact_virtual_grfs() return progress; } -static void -set_push_pull_constant_loc(unsigned uniform, int *chunk_start, - unsigned *max_chunk_bitsize, - bool contiguous, unsigned bitsize, - const unsigned target_bitsize, - int *push_constant_loc, int *pull_constant_loc, - unsigned *num_push_constants, - unsigned *num_pull_constants, - const unsigned max_push_components, - const unsigned max_chunk_size, - bool allow_pull_constants, - struct brw_stage_prog_data *stage_prog_data) -{ - /* This is the first live uniform in the chunk */ - if (*chunk_start < 0) - *chunk_start = uniform; - - /* Keep track of the maximum bit size access in contiguous uniforms */ - *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize); - - /* If this element does not need to be contiguous with the next, we - * split at this point and everything between chunk_start and u forms a - * single chunk. - */ - if (!contiguous) { - /* If bitsize doesn't match the target one, skip it */ - if (*max_chunk_bitsize != target_bitsize) { - /* FIXME: right now we only support 32 and 64-bit accesses */ - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); - *max_chunk_bitsize = 0; - *chunk_start = -1; - return; - } - - unsigned chunk_size = uniform - *chunk_start + 1; - - /* Decide whether we should push or pull this parameter. In the - * Vulkan driver, push constants are explicitly exposed via the API - * so we push everything. In GL, we only push small arrays. - */ - if (!allow_pull_constants || - (*num_push_constants + chunk_size <= max_push_components && - chunk_size <= max_chunk_size)) { - assert(*num_push_constants + chunk_size <= max_push_components); - for (unsigned j = *chunk_start; j <= uniform; j++) - push_constant_loc[j] = (*num_push_constants)++; - } else { - for (unsigned j = *chunk_start; j <= uniform; j++) - pull_constant_loc[j] = (*num_pull_constants)++; - } - - *max_chunk_bitsize = 0; - *chunk_start = -1; - } -} - static int -get_thread_local_id_param_index(const brw_stage_prog_data *prog_data) +get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) { if (prog_data->nr_params == 0) return -1; /* The local thread id is always the last parameter in the list */ uint32_t last_param = prog_data->param[prog_data->nr_params - 1]; - if (last_param == BRW_PARAM_BUILTIN_THREAD_LOCAL_ID) + if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID) return prog_data->nr_params - 1; return -1; } +/** + * Struct for handling complex alignments. + * + * A complex alignment is stored as multiplier and an offset. A value is + * considered to be aligned if it is {offset} larger than a multiple of {mul}. 
+ * For instance, with an alignment of {8, 2}, cplx_align_apply would do the + * following: + * + * N | cplx_align_apply({8, 2}, N) + * ----+----------------------------- + * 4 | 6 + * 6 | 6 + * 8 | 14 + * 10 | 14 + * 12 | 14 + * 14 | 14 + * 16 | 22 + */ +struct cplx_align { + unsigned mul:4; + unsigned offset:4; +}; + +#define CPLX_ALIGN_MAX_MUL 8 + +static void +cplx_align_assert_sane(struct cplx_align a) +{ + assert(a.mul > 0 && util_is_power_of_two(a.mul)); + assert(a.offset < a.mul); +} + +/** + * Combines two alignments to produce a least multiple of sorts. + * + * The returned alignment is the smallest (in terms of multiplier) such that + * anything aligned to both a and b will be aligned to the new alignment. + * This function will assert-fail if a and b are not compatible, i.e. if the + * offset parameters are such that no common alignment is possible. + */ +static struct cplx_align +cplx_align_combine(struct cplx_align a, struct cplx_align b) +{ + cplx_align_assert_sane(a); + cplx_align_assert_sane(b); + + /* Assert that the alignments agree. */ + assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); + + return a.mul > b.mul ? a : b; +} + +/** + * Apply a complex alignment + * + * This function will return the smallest number greater than or equal to + * offset that is aligned to align. + */ +static unsigned +cplx_align_apply(struct cplx_align align, unsigned offset) +{ + return ALIGN(offset - align.offset, align.mul) + align.offset; +} + +#define UNIFORM_SLOT_SIZE 4 + +struct uniform_slot_info { + /** True if the given uniform slot is live */ + unsigned is_live:1; + + /** True if this slot and the next slot must remain contiguous */ + unsigned contiguous:1; + + struct cplx_align align; +}; + +static void +mark_uniform_slots_read(struct uniform_slot_info *slots, + unsigned num_slots, unsigned alignment) +{ + assert(alignment > 0 && util_is_power_of_two(alignment)); + assert(alignment <= CPLX_ALIGN_MAX_MUL); + + /* We can't align a slot to anything less than the slot size */ + alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); + + struct cplx_align align = {alignment, 0}; + cplx_align_assert_sane(align); + + for (unsigned i = 0; i < num_slots; i++) { + slots[i].is_live = true; + if (i < num_slots - 1) + slots[i].contiguous = true; + + align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); + if (slots[i].align.mul == 0) { + slots[i].align = align; + } else { + slots[i].align = cplx_align_combine(slots[i].align, align); + } + } +} + /** * Assign UNIFORM file registers to either push constants or pull constants. * @@ -1956,64 +2036,53 @@ void fs_visitor::assign_constant_locations() { /* Only the first compile gets to decide on locations. */ - if (dispatch_width != min_dispatch_width) + if (push_constant_loc) { + assert(pull_constant_loc); return; + } - bool is_live[uniforms]; - memset(is_live, 0, sizeof(is_live)); - unsigned bitsize_access[uniforms]; - memset(bitsize_access, 0, sizeof(bitsize_access)); - - /* For each uniform slot, a value of true indicates that the given slot and - * the next slot must remain contiguous. This is used to keep us from - * splitting arrays apart. - */ - bool contiguous[uniforms]; - memset(contiguous, 0, sizeof(contiguous)); + struct uniform_slot_info slots[uniforms]; + memset(slots, 0, sizeof(slots)); - /* First, we walk through the instructions and do two things: - * - * 1) Figure out which uniforms are live. - * - * 2) Mark any indirectly used ranges of registers as contiguous. 
- * - * Note that we don't move constant-indexed accesses to arrays. No - * testing has been done of the performance impact of this choice. - */ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { for (int i = 0 ; i < inst->sources; i++) { if (inst->src[i].file != UNIFORM) continue; - int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; + /* NIR tightly packs things so the uniform number might not be + * aligned (if we have a double right after a float, for instance). + * This is fine because the process of re-arranging them will ensure + * that things are properly aligned. The offset into that uniform, + * however, must be aligned. + * + * In Vulkan, we have explicit offsets but everything is crammed + * into a single "variable" so inst->src[i].nr will always be 0. + * Everything will be properly aligned relative to that one base. + */ + assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); + + unsigned u = inst->src[i].nr + + inst->src[i].offset / UNIFORM_SLOT_SIZE; + if (u >= uniforms) + continue; + + unsigned slots_read; if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { - assert(inst->src[2].ud % 4 == 0); - unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; - assert(last < uniforms); - - for (unsigned j = constant_nr; j < last; j++) { - is_live[j] = true; - contiguous[j] = true; - bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type)); - } - is_live[last] = true; - bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type)); + slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); } else { - if (constant_nr >= 0 && constant_nr < (int) uniforms) { - int regs_read = inst->components_read(i) * - type_sz(inst->src[i].type) / 4; - for (int j = 0; j < regs_read; j++) { - is_live[constant_nr + j] = true; - bitsize_access[constant_nr + j] = - MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type)); - } - } + unsigned bytes_read = inst->components_read(i) * + type_sz(inst->src[i].type); + slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); } + + assert(u + slots_read <= uniforms); + mark_uniform_slots_read(&slots[u], slots_read, + type_sz(inst->src[i].type)); } } - int thread_local_id_index = get_thread_local_id_param_index(stage_prog_data); + int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); /* Only allow 16 registers (128 uniform components) as push constants. * @@ -2024,9 +2093,18 @@ fs_visitor::assign_constant_locations() * brw_curbe.c. */ unsigned int max_push_components = 16 * 8; - if (thread_local_id_index >= 0) + if (subgroup_id_index >= 0) max_push_components--; /* Save a slot for the thread ID */ + /* FIXME: We currently have some GPU hangs that happen apparently when using + * push constants. Since we have no solution for such hangs yet, just + * go ahead and use pull constants for now. + */ + if (devinfo->gen == 10 && compiler->supports_pull_constants) { + compiler->shader_perf_log(log_data, "Disabling push constants."); + max_push_components = 0; + } + /* We push small arrays, but no bigger than 16 floats. This is big enough * for a vec4 but hopefully not large enough to push out other stuff. We * should probably use a better heuristic at some point. 
@@ -2044,59 +2122,87 @@ fs_visitor::assign_constant_locations() memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); int chunk_start = -1; - unsigned max_chunk_bitsize = 0; - - /* First push 64-bit uniforms to ensure they are properly aligned */ - const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF); + struct cplx_align align; for (unsigned u = 0; u < uniforms; u++) { - if (!is_live[u]) + if (!slots[u].is_live) { + assert(chunk_start == -1); continue; + } - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, - contiguous[u], bitsize_access[u], - uniform_64_bit_size, - push_constant_loc, pull_constant_loc, - &num_push_constants, &num_pull_constants, - max_push_components, max_chunk_size, - compiler->supports_pull_constants, - stage_prog_data); + /* Skip subgroup_id_index to put it in the last push register. */ + if (subgroup_id_index == (int)u) + continue; - } + if (chunk_start == -1) { + chunk_start = u; + align = slots[u].align; + } else { + /* Offset into the chunk */ + unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; - /* Then push the rest of uniforms */ - const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); - for (unsigned u = 0; u < uniforms; u++) { - if (!is_live[u]) - continue; + /* Shift the slot alignment down by the chunk offset so it is + * comparable with the base chunk alignment. + */ + struct cplx_align slot_align = slots[u].align; + slot_align.offset = + (slot_align.offset - chunk_offset) & (align.mul - 1); + + align = cplx_align_combine(align, slot_align); + } - /* Skip thread_local_id_index to put it in the last push register. */ - if (thread_local_id_index == (int)u) + /* Sanity check the alignment */ + cplx_align_assert_sane(align); + + if (slots[u].contiguous) continue; - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, - contiguous[u], bitsize_access[u], - uniform_32_bit_size, - push_constant_loc, pull_constant_loc, - &num_push_constants, &num_pull_constants, - max_push_components, max_chunk_size, - compiler->supports_pull_constants, - stage_prog_data); + /* Adjust the alignment to be in terms of slots, not bytes */ + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); + align.mul /= UNIFORM_SLOT_SIZE; + align.offset /= UNIFORM_SLOT_SIZE; + + unsigned push_start_align = cplx_align_apply(align, num_push_constants); + unsigned chunk_size = u - chunk_start + 1; + if ((!compiler->supports_pull_constants && u < UBO_START) || + (chunk_size < max_chunk_size && + push_start_align + chunk_size <= max_push_components)) { + /* Align up the number of push constants */ + num_push_constants = push_start_align; + for (unsigned i = 0; i < chunk_size; i++) + push_constant_loc[chunk_start + i] = num_push_constants++; + } else { + /* We need to pull this one */ + num_pull_constants = cplx_align_apply(align, num_pull_constants); + for (unsigned i = 0; i < chunk_size; i++) + pull_constant_loc[chunk_start + i] = num_pull_constants++; + } + + /* Reset the chunk and start again */ + chunk_start = -1; } /* Add the CS local thread ID uniform at the end of the push constants */ - if (thread_local_id_index >= 0) - push_constant_loc[thread_local_id_index] = num_push_constants++; + if (subgroup_id_index >= 0) + push_constant_loc[subgroup_id_index] = num_push_constants++; /* As the uniforms are going to be reordered, stash the old array and * create two new arrays for push/pull params. 
*/ uint32_t *param = stage_prog_data->param; stage_prog_data->nr_params = num_push_constants; - stage_prog_data->param = ralloc_array(mem_ctx, uint32_t, num_push_constants); + if (num_push_constants) { + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, + num_push_constants); + } else { + stage_prog_data->param = NULL; + } + assert(stage_prog_data->nr_pull_params == 0); + assert(stage_prog_data->pull_param == NULL); if (num_pull_constants > 0) { stage_prog_data->nr_pull_params = num_pull_constants; - stage_prog_data->pull_param = ralloc_array(NULL, uint32_t, - num_pull_constants); + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, + num_pull_constants); } /* Now that we know how many regular uniforms we'll push, reduce the @@ -2416,8 +2522,17 @@ fs_visitor::opt_algebraic() progress = true; } else if (inst->src[1].file == IMM) { inst->opcode = BRW_OPCODE_MOV; - inst->src[0] = component(inst->src[0], - inst->src[1].ud); + /* It's possible that the selected component will be too large and + * overflow the register. This can happen if someone does a + * readInvocation() from GLSL or SPIR-V and provides an OOB + * invocationIndex. If this happens and we some how manage + * to constant fold it in and get here, then component() may cause + * us to start reading outside of the VGRF which will lead to an + * assert later. Instead, just let it wrap around if it goes over + * exec_size. + */ + const unsigned comp = inst->src[1].ud & (inst->exec_size - 1); + inst->src[0] = component(inst->src[0], comp); inst->sources = 1; inst->force_writemask_all = true; progress = true; @@ -3066,6 +3181,42 @@ fs_visitor::remove_duplicate_mrf_writes() return progress; } +/** + * Rounding modes for conversion instructions are included for each + * conversion, but right now it is a state. So once it is set, + * we don't need to call it again for subsequent calls. + * + * This is useful for vector/matrices conversions, as setting the + * mode once is enough for the full vector/matrix + */ +bool +fs_visitor::remove_extra_rounding_modes() +{ + bool progress = false; + + foreach_block (block, cfg) { + brw_rnd_mode prev_mode = BRW_RND_MODE_UNSPECIFIED; + + foreach_inst_in_block_safe (fs_inst, inst, block) { + if (inst->opcode == SHADER_OPCODE_RND_MODE) { + assert(inst->src[0].file == BRW_IMMEDIATE_VALUE); + const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d; + if (mode == prev_mode) { + inst->remove(block); + progress = true; + } else { + prev_mode = mode; + } + } + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + static void clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) { @@ -3480,14 +3631,22 @@ fs_visitor::lower_integer_multiplication() * schedule multi-component multiplications much better. 
*/ + bool needs_mov = false; fs_reg orig_dst = inst->dst; - if (orig_dst.is_null() || orig_dst.file == MRF) { - inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), - inst->dst.type); - } fs_reg low = inst->dst; - fs_reg high(VGRF, alloc.allocate(dispatch_width / 8), - inst->dst.type); + if (orig_dst.is_null() || orig_dst.file == MRF || + regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0)) || + regions_overlap(inst->dst, inst->size_written, + inst->src[1], inst->size_read(1))) { + needs_mov = true; + low.nr = alloc.allocate(regs_written(inst)); + low.offset = low.offset % REG_SIZE; + } + + fs_reg high = inst->dst; + high.nr = alloc.allocate(regs_written(inst)); + high.offset = high.offset % REG_SIZE; if (devinfo->gen >= 7) { if (inst->src[1].file == IMM) { @@ -3508,13 +3667,13 @@ fs_visitor::lower_integer_multiplication() inst->src[1]); } - ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1), + ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), subscript(low, BRW_REGISTER_TYPE_UW, 1), subscript(high, BRW_REGISTER_TYPE_UW, 0)); - if (inst->conditional_mod || orig_dst.file == MRF) { + if (needs_mov || inst->conditional_mod) { set_condmod(inst->conditional_mod, - ibld.MOV(orig_dst, inst->dst)); + ibld.MOV(orig_dst, low)); } } @@ -4268,7 +4427,7 @@ emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask) fs_builder ubld = bld.exec_all().group(8, 0); const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); ubld.MOV(dst, brw_imm_d(0)); - ubld.MOV(component(dst, 7), sample_mask); + ubld.group(1, 0).MOV(component(dst, 7), sample_mask); return dst; } @@ -4468,6 +4627,18 @@ fs_visitor::lower_logical_sends() ibld.sample_mask_reg()); break; + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_BYTE_SCATTERED_READ, + fs_reg()); + break; + + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + lower_surface_logical_send(ibld, inst, + SHADER_OPCODE_BYTE_SCATTERED_WRITE, + ibld.sample_mask_reg()); + break; + case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: lower_surface_logical_send(ibld, inst, SHADER_OPCODE_UNTYPED_ATOMIC, @@ -4836,7 +5007,7 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, return MIN2(8, inst->exec_size); case FS_OPCODE_LINTERP: - case FS_OPCODE_GET_BUFFER_SIZE: + case SHADER_OPCODE_GET_BUFFER_SIZE: case FS_OPCODE_DDX_COARSE: case FS_OPCODE_DDX_FINE: case FS_OPCODE_DDY_COARSE: @@ -4952,6 +5123,8 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: return MIN2(16, inst->exec_size); case SHADER_OPCODE_URB_READ_SIMD8: @@ -5021,12 +5194,10 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i) /** * Extract the data that would be consumed by the channel group given by * lbld.group() from the i-th source region of instruction \p inst and return - * it as result in packed form. If any copy instructions are required they - * will be emitted before the given \p inst in \p block. + * it as result in packed form. */ static fs_reg -emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst, - unsigned i) +emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) { /* Specified channel group from the source region. 
*/ const fs_reg src = horiz_offset(inst->src[i], lbld.group()); @@ -5041,8 +5212,7 @@ emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst, const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i)); for (unsigned k = 0; k < inst->components_read(i); ++k) - cbld.at(block, inst) - .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); + cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); return tmp; @@ -5108,40 +5278,51 @@ needs_dst_copy(const fs_builder &lbld, const fs_inst *inst) /** * Insert data from a packed temporary into the channel group given by * lbld.group() of the destination region of instruction \p inst and return - * the temporary as result. If any copy instructions are required they will - * be emitted around the given \p inst in \p block. + * the temporary as result. Any copy instructions that are required for + * unzipping the previous value (in the case of partial writes) will be + * inserted using \p lbld_before and any copy instructions required for + * zipping up the destination of \p inst will be inserted using \p lbld_after. */ static fs_reg -emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst) +emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after, + fs_inst *inst) { - /* Builder of the right width to perform the copy avoiding uninitialized - * data if the lowered execution size is greater than the original - * execution size of the instruction. - */ - const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), - inst->exec_size), 0); + assert(lbld_before.dispatch_width() == lbld_after.dispatch_width()); + assert(lbld_before.group() == lbld_after.group()); /* Specified channel group from the destination region. */ - const fs_reg dst = horiz_offset(inst->dst, lbld.group()); + const fs_reg dst = horiz_offset(inst->dst, lbld_after.group()); const unsigned dst_size = inst->size_written / inst->dst.component_size(inst->exec_size); - if (needs_dst_copy(lbld, inst)) { - const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size); + if (needs_dst_copy(lbld_after, inst)) { + const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size); if (inst->predicate) { /* Handle predication by copying the original contents of * the destination into the temporary before emitting the * lowered instruction. */ - for (unsigned k = 0; k < dst_size; ++k) - cbld.at(block, inst) - .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k)); + const fs_builder gbld_before = + lbld_before.group(MIN2(lbld_before.dispatch_width(), + inst->exec_size), 0); + for (unsigned k = 0; k < dst_size; ++k) { + gbld_before.MOV(offset(tmp, lbld_before, k), + offset(dst, inst->exec_size, k)); + } } - for (unsigned k = 0; k < dst_size; ++k) - cbld.at(block, inst->next) - .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k)); + const fs_builder gbld_after = + lbld_after.group(MIN2(lbld_after.dispatch_width(), + inst->exec_size), 0); + for (unsigned k = 0; k < dst_size; ++k) { + /* Use a builder of the right width to perform the copy avoiding + * uninitialized data if the lowered execution size is greater than + * the original execution size of the instruction. + */ + gbld_after.MOV(offset(dst, inst->exec_size, k), + offset(tmp, lbld_after, k)); + } return tmp; @@ -5181,6 +5362,20 @@ fs_visitor::lower_simd_width() assert(!inst->writes_accumulator && !inst->mlen); + /* Inserting the zip, unzip, and duplicated instructions in all of + * the right spots is somewhat tricky. 
All of the unzip and any + * instructions from the zip which unzip the destination prior to + * writing need to happen before all of the per-group instructions + * and the zip instructions need to happen after. In order to sort + * this all out, we insert the unzip instructions before \p inst, + * insert the per-group instructions after \p inst (i.e. before + * inst->next), and insert the zip instructions before the + * instruction after \p inst. Since we are inserting instructions + * after \p inst, inst->next is a moving target and we need to save + * it off here so that we insert the zip instructions in the right + * place. + */ + exec_node *const after_inst = inst->next; for (unsigned i = 0; i < n; i++) { /* Emit a copy of the original instruction with the lowered width. * If the EOT flag was set throw it away except for the last @@ -5188,7 +5383,7 @@ fs_visitor::lower_simd_width() */ fs_inst split_inst = *inst; split_inst.exec_size = lower_width; - split_inst.eot = inst->eot && i == n - 1; + split_inst.eot = inst->eot && i == 0; /* Select the correct channel enables for the i-th group, then * transform the sources and destination and emit the lowered @@ -5197,13 +5392,14 @@ fs_visitor::lower_simd_width() const fs_builder lbld = ibld.group(lower_width, i); for (unsigned j = 0; j < inst->sources; j++) - split_inst.src[j] = emit_unzip(lbld, block, inst, j); + split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j); - split_inst.dst = emit_zip(lbld, block, inst); + split_inst.dst = emit_zip(lbld.at(block, inst), + lbld.at(block, after_inst), inst); split_inst.size_written = split_inst.dst.component_size(lower_width) * dst_size; - lbld.emit(split_inst); + lbld.at(block, inst->next).emit(split_inst); } inst->remove(block); @@ -5751,6 +5947,7 @@ fs_visitor::optimize() int pass_num = 0; OPT(opt_drop_redundant_mov_to_flags); + OPT(remove_extra_rounding_modes); do { progress = false; @@ -5863,7 +6060,7 @@ fs_visitor::fixup_3src_null_dest() } void -fs_visitor::allocate_registers(bool allow_spilling) +fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling) { bool allocated_without_spills; @@ -5929,6 +6126,8 @@ fs_visitor::allocate_registers(bool allow_spilling) if (failed) return; + opt_bank_conflicts(); + schedule_instructions(SCHEDULE_POST); if (last_scratch > 0) { @@ -5998,7 +6197,7 @@ fs_visitor::run_vs() assign_vs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(true); + allocate_registers(8, true); return !failed; } @@ -6078,7 +6277,7 @@ fs_visitor::run_tcs_single_patch() assign_tcs_single_patch_urb_setup(); fixup_3src_null_dest(); - allocate_registers(true); + allocate_registers(8, true); return !failed; } @@ -6112,7 +6311,7 @@ fs_visitor::run_tes() assign_tes_urb_setup(); fixup_3src_null_dest(); - allocate_registers(true); + allocate_registers(8, true); return !failed; } @@ -6161,7 +6360,7 @@ fs_visitor::run_gs() assign_gs_urb_setup(); fixup_3src_null_dest(); - allocate_registers(true); + allocate_registers(8, true); return !failed; } @@ -6261,7 +6460,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) assign_urb_setup(); fixup_3src_null_dest(); - allocate_registers(allow_spilling); + allocate_registers(8, allow_spilling); if (failed) return false; @@ -6271,9 +6470,10 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) } bool -fs_visitor::run_cs() +fs_visitor::run_cs(unsigned min_dispatch_width) { assert(stage == MESA_SHADER_COMPUTE); + assert(dispatch_width >= min_dispatch_width); setup_cs_payload(); @@ -6304,7 +6504,7 @@ 
fs_visitor::run_cs() assign_curb_setup(); fixup_3src_null_dest(); - allocate_registers(true); + allocate_registers(min_dispatch_width, true); if (failed) return false; @@ -6724,20 +6924,20 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, struct brw_cs_prog_data *cs_prog_data) { const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - int thread_local_id_index = get_thread_local_id_param_index(prog_data); + int subgroup_id_index = get_subgroup_id_param_index(prog_data); bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell; /* The thread ID should be stored in the last param dword */ - assert(thread_local_id_index == -1 || - thread_local_id_index == (int)prog_data->nr_params - 1); + assert(subgroup_id_index == -1 || + subgroup_id_index == (int)prog_data->nr_params - 1); unsigned cross_thread_dwords, per_thread_dwords; if (!cross_thread_supported) { cross_thread_dwords = 0u; per_thread_dwords = prog_data->nr_params; - } else if (thread_local_id_index >= 0) { + } else if (subgroup_id_index >= 0) { /* Fill all but the last register with cross-thread payload */ - cross_thread_dwords = 8 * (thread_local_id_index / 8); + cross_thread_dwords = 8 * (subgroup_id_index / 8); per_thread_dwords = prog_data->nr_params - cross_thread_dwords; assert(per_thread_dwords > 0 && per_thread_dwords <= 8); } else { @@ -6770,6 +6970,20 @@ cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size) cs_prog_data->threads = (group_size + size - 1) / size; } +static nir_shader * +compile_cs_to_nir(const struct brw_compiler *compiler, + void *mem_ctx, + const struct brw_cs_prog_key *key, + struct brw_cs_prog_data *prog_data, + const nir_shader *src_shader, + unsigned dispatch_width) +{ + nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); + shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); + brw_nir_lower_cs_intrinsics(shader, dispatch_width); + return brw_postprocess_nir(shader, compiler, true); +} + const unsigned * brw_compile_cs(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, @@ -6779,116 +6993,134 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, int shader_time_index, char **error_str) { - nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); - shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); - - brw_nir_lower_cs_intrinsics(shader, prog_data); - shader = brw_postprocess_nir(shader, compiler, true); - - prog_data->local_size[0] = shader->info.cs.local_size[0]; - prog_data->local_size[1] = shader->info.cs.local_size[1]; - prog_data->local_size[2] = shader->info.cs.local_size[2]; + prog_data->local_size[0] = src_shader->info.cs.local_size[0]; + prog_data->local_size[1] = src_shader->info.cs.local_size[1]; + prog_data->local_size[2] = src_shader->info.cs.local_size[2]; unsigned local_workgroup_size = - shader->info.cs.local_size[0] * shader->info.cs.local_size[1] * - shader->info.cs.local_size[2]; + src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * + src_shader->info.cs.local_size[2]; - unsigned max_cs_threads = compiler->devinfo->max_cs_threads; - unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads); + unsigned min_dispatch_width = + DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads); + min_dispatch_width = MAX2(8, min_dispatch_width); + min_dispatch_width = util_next_power_of_two(min_dispatch_width); + assert(min_dispatch_width <= 32); + fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; cfg_t *cfg = 
NULL; const char *fail_msg = NULL; + unsigned promoted_constants; /* Now the main event: Visit the shader IR and generate our CS IR for it. */ - fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, - NULL, /* Never used in core profile */ - shader, 8, shader_time_index); - if (simd_required <= 8) { - if (!v8.run_cs()) { - fail_msg = v8.fail_msg; + if (min_dispatch_width <= 8) { + nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key, + prog_data, src_shader, 8); + v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + nir8, 8, shader_time_index); + if (!v8->run_cs(min_dispatch_width)) { + fail_msg = v8->fail_msg; } else { - cfg = v8.cfg; + /* We should always be able to do SIMD32 for compute shaders */ + assert(v8->max_dispatch_width >= 32); + + cfg = v8->cfg; cs_set_simd_size(prog_data, 8); cs_fill_push_const_info(compiler->devinfo, prog_data); - prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + promoted_constants = v8->promoted_constants; } } - fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, - NULL, /* Never used in core profile */ - shader, 16, shader_time_index); if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && - !fail_msg && v8.max_dispatch_width >= 16 && - simd_required <= 16) { + !fail_msg && min_dispatch_width <= 16) { /* Try a SIMD16 compile */ - if (simd_required <= 8) - v16.import_uniforms(&v8); - if (!v16.run_cs()) { + nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key, + prog_data, src_shader, 16); + v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + nir16, 16, shader_time_index); + if (v8) + v16->import_uniforms(v8); + + if (!v16->run_cs(min_dispatch_width)) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", - v16.fail_msg); + v16->fail_msg); if (!cfg) { fail_msg = "Couldn't generate SIMD16 program and not " "enough threads for SIMD8"; } } else { - cfg = v16.cfg; + /* We should always be able to do SIMD32 for compute shaders */ + assert(v16->max_dispatch_width >= 32); + + cfg = v16->cfg; cs_set_simd_size(prog_data, 16); cs_fill_push_const_info(compiler->devinfo, prog_data); - prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + promoted_constants = v16->promoted_constants; } } - fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base, - NULL, /* Never used in core profile */ - shader, 32, shader_time_index); - if (!fail_msg && v8.max_dispatch_width >= 32 && - (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) { - /* Try a SIMD32 compile */ - if (simd_required <= 8) - v32.import_uniforms(&v8); - else if (simd_required <= 16) - v32.import_uniforms(&v16); + /* We should always be able to do SIMD32 for compute shaders */ + assert(!v16 || v16->max_dispatch_width >= 32); - if (!v32.run_cs()) { + if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) { + /* Try a SIMD32 compile */ + nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key, + prog_data, src_shader, 32); + v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base, + NULL, /* Never used in core profile */ + nir32, 32, shader_time_index); + if (v8) + v32->import_uniforms(v8); + else if (v16) + v32->import_uniforms(v16); + + if (!v32->run_cs(min_dispatch_width)) { compiler->shader_perf_log(log_data, "SIMD32 shader failed to compile: %s", - v16.fail_msg); + v16->fail_msg); if (!cfg) { fail_msg = "Couldn't generate SIMD32 program and not " "enough threads for 
SIMD16"; } } else { - cfg = v32.cfg; + cfg = v32->cfg; cs_set_simd_size(prog_data, 32); cs_fill_push_const_info(compiler->devinfo, prog_data); + promoted_constants = v32->promoted_constants; } } + const unsigned *ret = NULL; if (unlikely(cfg == NULL)) { assert(fail_msg); if (error_str) *error_str = ralloc_strdup(mem_ctx, fail_msg); + } else { + fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base, + promoted_constants, false, MESA_SHADER_COMPUTE); + if (INTEL_DEBUG & DEBUG_CS) { + char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", + src_shader->info.label ? + src_shader->info.label : "unnamed", + src_shader->info.name); + g.enable_debug(name); + } - return NULL; - } + g.generate_code(cfg, prog_data->simd_size); - fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base, - v8.promoted_constants, v8.runtime_check_aads_emit, - MESA_SHADER_COMPUTE); - if (INTEL_DEBUG & DEBUG_CS) { - char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", - shader->info.label ? shader->info.label : - "unnamed", - shader->info.name); - g.enable_debug(name); + ret = g.get_assembly(&prog_data->base.program_size); } - g.generate_code(cfg, prog_data->simd_size); + delete v8; + delete v16; + delete v32; - return g.get_assembly(&prog_data->base.program_size); + return ret; } /**
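
For reference, the cross-thread vs. per-thread split performed by cs_fill_push_const_info above can be reproduced in isolation. The sketch below is illustrative only: the parameter counts in main() are made up, and it assumes (as the asserts in the hunk do) that the subgroup ID, when used, is the last entry in the param list.

/* Standalone sketch of the push-constant payload split in
 * cs_fill_push_const_info.  All numbers in main() are hypothetical. */
#include <cassert>
#include <cstdio>

struct push_split {
   unsigned cross_thread_dwords;
   unsigned per_thread_dwords;
};

static struct push_split
split_push_constants(unsigned nr_params, int subgroup_id_index,
                     bool cross_thread_supported)
{
   /* When present, the subgroup ID must be the last parameter. */
   assert(subgroup_id_index == -1 ||
          subgroup_id_index == (int)nr_params - 1);

   struct push_split s;
   if (!cross_thread_supported) {
      /* Gen7 and earlier (other than Haswell): everything is per-thread. */
      s.cross_thread_dwords = 0;
      s.per_thread_dwords = nr_params;
   } else if (subgroup_id_index >= 0) {
      /* Fill all but the last register with cross-thread payload; the
       * register holding the subgroup ID stays per-thread. */
      s.cross_thread_dwords = 8 * (subgroup_id_index / 8);
      s.per_thread_dwords = nr_params - s.cross_thread_dwords;
      assert(s.per_thread_dwords > 0 && s.per_thread_dwords <= 8);
   } else {
      s.cross_thread_dwords = nr_params;
      s.per_thread_dwords = 0;
   }
   return s;
}

int main()
{
   /* 21 uniform dwords with the subgroup ID as the 21st: two full registers
    * (16 dwords) go cross-thread, the remaining 5 stay per-thread. */
   struct push_split s = split_push_constants(21, 20, true);
   printf("cross=%u per-thread=%u\n", s.cross_thread_dwords, s.per_thread_dwords);
   return 0;
}
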
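
The SIMD8/16/32 compile decisions in brw_compile_cs all hang off the min_dispatch_width computed near the top of the function. Below is a standalone sketch of that computation; div_round_up and next_power_of_two stand in for mesa's DIV_ROUND_UP and util_next_power_of_two, and the max_cs_threads values in main() are examples, not taken from the patch.

/* Standalone sketch of the min_dispatch_width selection in brw_compile_cs. */
#include <cassert>
#include <cstdio>

static unsigned div_round_up(unsigned a, unsigned b)
{
   return (a + b - 1) / b;
}

static unsigned next_power_of_two(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* The whole workgroup must fit in the hardware threads available, so the
 * dispatch width must be at least local_workgroup_size / max_cs_threads,
 * rounded up to a supported SIMD width (8, 16 or 32). */
static unsigned
min_cs_dispatch_width(unsigned size_x, unsigned size_y, unsigned size_z,
                      unsigned max_cs_threads)
{
   unsigned local_workgroup_size = size_x * size_y * size_z;
   unsigned w = div_round_up(local_workgroup_size, max_cs_threads);
   if (w < 8)
      w = 8;
   w = next_power_of_two(w);
   assert(w <= 32);
   return w;
}

int main()
{
   /* 16x16x1 workgroup, 56 CS threads: 256/56 rounds up to 5, clamped to 8,
    * so SIMD8 is permitted (SIMD16/SIMD32 may still be tried on top). */
   printf("%u\n", min_cs_dispatch_width(16, 16, 1, 56));  /* 8 */

   /* 32x32x1 workgroup, 56 CS threads: 1024/56 rounds up to 19, next power
    * of two is 32, so only the SIMD32 variant can cover the workgroup. */
   printf("%u\n", min_cs_dispatch_width(32, 32, 1, 56));  /* 32 */
   return 0;
}

Once min_dispatch_width is known, brw_compile_cs only constructs the fs_visitors whose width can cover the workgroup, and each one gets its own NIR clone via compile_cs_to_nir so the compute intrinsics can be lowered for that specific dispatch width.
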