X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_fs.cpp;h=68e73cc5cd801a672bbb73e884444c5f9e6556d6;hb=a497ab6838ae5a9898abfed82f7bc8295b490911;hp=f839d368f3c8a0185566a6d3928d0cdd2cb9a222;hpb=91d61fbf7cb61a44adcaae51ee08ad0dd6b2a03b;p=mesa.git

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f839d368f3c..68e73cc5cd8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -462,6 +462,8 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return type->components();
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
       return type->components() * 2;
    case GLSL_TYPE_ARRAY:
       return type_size_scalar(type->fields.array) * type->length;
@@ -492,19 +494,6 @@ type_size_scalar(const struct glsl_type *type)
    return 0;
 }
 
-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
-   if (type->is_double()) {
-      return type_size_dvec4(type);
-   } else {
-      return type_size_vec4(type);
-   }
-}
-
 /**
  * Create a MOV to read the timestamp register.
  *
@@ -730,7 +719,7 @@ fs_inst::components_read(unsigned i) const
                opcode == SHADER_OPCODE_TXD_LOGICAL)
          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
       /* Texture offset. */
-      else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE)
+      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
       /* MCS */
       else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
@@ -1864,7 +1853,10 @@ fs_visitor::compact_virtual_grfs()
 }
 
 static void
-set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+                           unsigned *max_chunk_bitsize,
+                           bool contiguous, unsigned bitsize,
+                           const unsigned target_bitsize,
                            int *push_constant_loc, int *pull_constant_loc,
                            unsigned *num_push_constants,
                            unsigned *num_pull_constants,
@@ -1876,11 +1868,23 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
    if (*chunk_start < 0)
       *chunk_start = uniform;
 
+   /* Keep track of the maximum bit size access in contiguous uniforms */
+   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
    /* If this element does not need to be contiguous with the next, we
    * split at this point and everything between chunk_start and u forms a
    * single chunk.
    */
    if (!contiguous) {
+      /* If bitsize doesn't match the target one, skip it */
+      if (*max_chunk_bitsize != target_bitsize) {
+         /* FIXME: right now we only support 32 and 64-bit accesses */
+         assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+         *max_chunk_bitsize = 0;
+         *chunk_start = -1;
+         return;
+      }
+
       unsigned chunk_size = uniform - *chunk_start + 1;
 
       /* Decide whether we should push or pull this parameter.  In the
@@ -1898,6 +1902,7 @@ set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
             pull_constant_loc[j] = (*num_pull_constants)++;
       }
 
+      *max_chunk_bitsize = 0;
       *chunk_start = -1;
    }
 }
@@ -1920,8 +1925,8 @@ fs_visitor::assign_constant_locations()
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
 
-   bool is_live_64bit[uniforms];
-   memset(is_live_64bit, 0, sizeof(is_live_64bit));
+   unsigned bitsize_access[uniforms];
+   memset(bitsize_access, 0, sizeof(bitsize_access));
 
    /* For each uniform slot, a value of true indicates that the given slot and
    * the next slot must remain contiguous.  This is used to keep us from
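The new max_chunk_bitsize/target_bitsize bookkeeping above splits contiguous uniform ranges by the widest access that touches them, so 64-bit chunks can be laid out before 32-bit ones. A standalone sketch of that idea, in plain C++ rather than the driver's fs_visitor data structures (slot flags are hand-made here and the push-vs-pull decision is simplified away):

/* Two-pass chunk layout: a chunk only gets locations in the pass whose
 * target size matches the widest access seen inside the chunk.
 */
#include <algorithm>
#include <cstdio>
#include <vector>

struct Slot {
   bool live;
   bool contiguous;   /* must stay adjacent to the next slot */
   unsigned bitsize;  /* widest access in bytes: 4 or 8 */
};

static void
lay_out_chunks(const std::vector<Slot> &slots, unsigned target_bitsize,
               std::vector<int> &location, unsigned *next_loc)
{
   int chunk_start = -1;
   unsigned max_chunk_bitsize = 0;

   for (unsigned u = 0; u < slots.size(); u++) {
      if (!slots[u].live)
         continue;

      if (chunk_start < 0)
         chunk_start = u;
      max_chunk_bitsize = std::max(max_chunk_bitsize, slots[u].bitsize);

      if (!slots[u].contiguous) {
         /* End of a chunk: place it only when its widest access matches
          * the size this pass handles; otherwise a later pass will.
          */
         if (max_chunk_bitsize == target_bitsize) {
            for (unsigned j = chunk_start; j <= u; j++)
               location[j] = (*next_loc)++;
         }
         chunk_start = -1;
         max_chunk_bitsize = 0;
      }
   }
}

int main()
{
   std::vector<Slot> slots = {
      {true, true, 8}, {true, false, 8},   /* one 64-bit chunk */
      {true, false, 4},                    /* one 32-bit chunk */
   };
   std::vector<int> location(slots.size(), -1);
   unsigned next_loc = 0;

   lay_out_chunks(slots, 8, location, &next_loc);  /* 64-bit chunks first */
   lay_out_chunks(slots, 4, location, &next_loc);  /* then 32-bit chunks  */

   for (unsigned u = 0; u < slots.size(); u++)
      printf("slot %u -> location %d\n", u, location[u]);
   return 0;
}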
@@ -1958,20 +1963,18 @@ fs_visitor::assign_constant_locations()
          for (unsigned j = constant_nr; j < last; j++) {
             is_live[j] = true;
             contiguous[j] = true;
-            if (type_sz(inst->src[i].type) == 8) {
-               is_live_64bit[j] = true;
-            }
+            bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
          }
          is_live[last] = true;
+         bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
       } else {
          if (constant_nr >= 0 && constant_nr < (int) uniforms) {
             int regs_read = inst->components_read(i) *
                type_sz(inst->src[i].type) / 4;
             for (int j = 0; j < regs_read; j++) {
                is_live[constant_nr + j] = true;
-               if (type_sz(inst->src[i].type) == 8) {
-                  is_live_64bit[constant_nr + j] = true;
-               }
+               bitsize_access[constant_nr + j] =
+                  MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
             }
          }
       }
@@ -2010,13 +2013,17 @@ fs_visitor::assign_constant_locations()
    memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
 
    int chunk_start = -1;
+   unsigned max_chunk_bitsize = 0;
 
    /* First push 64-bit uniforms to ensure they are properly aligned */
+   const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || !is_live_64bit[u])
+      if (!is_live[u])
         continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_64_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
@@ -2025,15 +2032,18 @@ fs_visitor::assign_constant_locations()
    }
 
    /* Then push the rest of uniforms */
+   const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
    for (unsigned u = 0; u < uniforms; u++) {
-      if (!is_live[u] || is_live_64bit[u])
+      if (!is_live[u])
         continue;
 
       /* Skip thread_local_id_index to put it in the last push register. */
       if (thread_local_id_index == (int)u)
         continue;
 
-      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_32_bit_size,
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
                                  max_push_components, max_chunk_size,
@@ -2111,25 +2121,22 @@ fs_visitor::lower_constant_loads()
          if (pull_index == -1)
            continue;
 
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst;
-
-         if (type_sz(inst->src[i].type) <= 4)
-            dst = vgrf(glsl_type::float_type);
-         else
-            dst = vgrf(glsl_type::double_type);
-
          assert(inst->src[i].stride == 0);
 
-         const fs_builder ubld = ibld.exec_all().group(8, 0);
-         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                   dst, brw_imm_ud(index), offset);
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
 
         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
-         inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
 
         brw_mark_surface_used(prog_data, index);
      }
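The rewritten lower_constant_loads() above fetches a whole 64-byte block per load and then points the instruction at the constant's position inside that block. The offset math can be checked in isolation; this is a plain C++ sketch of just that arithmetic, not the builder code:

/* Each pull_index addresses a 4-byte constant; the load fetches the
 * surrounding 64-byte block and the instruction reads at the byte offset
 * of the constant inside that block.
 */
#include <cassert>
#include <cstdio>
#include <initializer_list>

int main()
{
   const unsigned block_sz = 64;   /* one cacheline */

   for (unsigned pull_index : {0u, 3u, 17u, 31u}) {
      const unsigned base = pull_index * 4;                 /* byte address  */
      const unsigned block_start = base & ~(block_sz - 1);  /* load offset   */
      const unsigned within = base & (block_sz - 1);        /* register offset */

      assert(block_start + within == base);
      printf("pull_index %2u: fetch bytes [%3u, %3u), use offset %2u\n",
             pull_index, block_start, block_start + block_sz, within);
   }
   return 0;
}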
@@ -3202,44 +3209,18 @@ fs_visitor::lower_uniform_pull_constant_loads()
         continue;
 
      if (devinfo->gen >= 7) {
-         /* The offset arg is a vec4-aligned immediate byte offset. */
-         fs_reg const_offset_reg = inst->src[1];
-         assert(const_offset_reg.file == IMM &&
-                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
-         assert(const_offset_reg.ud % 16 == 0);
-
-         fs_reg payload, offset;
-         if (devinfo->gen >= 9) {
-            /* We have to use a message header on Skylake to get SIMD4x2
-             * mode. Reserve space for the register.
-            */
-            offset = payload = fs_reg(VGRF, alloc.allocate(2));
-            offset.offset += REG_SIZE;
-            inst->mlen = 2;
-         } else {
-            offset = payload = fs_reg(VGRF, alloc.allocate(1));
-            inst->mlen = 1;
-         }
-
-         /* This is actually going to be a MOV, but since only the first dword
-          * is accessed, we have a special opcode to do just that one. Note
-          * that this needs to be an operation that will be considered a def
-          * by live variable analysis, or register allocation will explode.
-          */
-         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
-                                               8, offset, const_offset_reg);
-         setup->force_writemask_all = true;
+         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
+         const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
 
-         setup->ir = inst->ir;
-         setup->annotation = inst->annotation;
-         inst->insert_before(block, setup);
+         ubld.group(8, 0).MOV(payload,
+                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         ubld.group(1, 0).MOV(component(payload, 2),
+                              brw_imm_ud(inst->src[1].ud / 16));
 
-         /* Similarly, this will only populate the first 4 channels of the
-          * result register (since we only use smear values from 0-3), but we
-          * don't tell the optimizer.
-          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;
+         inst->header_size = 1;
+         inst->mlen = 1;
 
         invalidate_live_intervals();
      } else {
@@ -3673,6 +3654,12 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
        */
       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
       length++;
+   } else if (key->replicate_alpha && inst->target != 0) {
+      /* Handle the case when fragment shader doesn't write to draw buffer
+       * zero. No need to call setup_color_payload() for src0_alpha because
+       * alpha value will be undefined.
+       */
+      length++;
    }
 
    setup_color_payload(bld, key, &sources[length], color0, components);
@@ -3827,8 +3814,8 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
    }
 
    if (has_lod) {
-      /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
-       * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
+       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
        */
      assert(shadow_c.file != BAD_FILE ?
             bld.dispatch_width() == 8 : bld.dispatch_width() == 16);
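In the gen7 path of lower_uniform_pull_constant_loads() above, the message now carries a one-register header: all of g0 is copied into the payload and dword 2 is overwritten with the constant offset expressed in 16-byte units. A standalone model of that header layout in plain C++ (the g0 contents here are placeholders; only the dword-2 write mirrors the two MOVs in the hunk):

#include <cstdint>
#include <cstdio>

int main()
{
   uint32_t g0[8] = {0x1111, 0x2222, 0x3333, 0x4444,
                     0x5555, 0x6666, 0x7777, 0x8888}; /* stand-in for r0 */
   const uint32_t const_offset_bytes = 112;           /* what src[1].ud held */

   uint32_t header[8];
   for (int i = 0; i < 8; i++)            /* ubld.group(8, 0).MOV(payload, g0) */
      header[i] = g0[i];
   header[2] = const_offset_bytes / 16;   /* MOV(component(payload, 2), offset / 16) */

   printf("header[2] = %u (offset in 16-byte units)\n", header[2]);
   return 0;
}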
@@ -3871,7 +3858,6 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &sample_index,
                                 const fs_reg &surface,
                                 const fs_reg &sampler,
-                                const fs_reg &offset_value,
                                 unsigned coord_components,
                                 unsigned grad_components)
 {
@@ -3879,7 +3865,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
    fs_reg msg_coords = message;
    unsigned header_size = 0;
 
-   if (offset_value.file != BAD_FILE) {
+   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
@@ -3979,7 +3965,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &mcs,
                                 const fs_reg &surface,
                                 const fs_reg &sampler,
-                                const fs_reg &offset_value,
+                                const fs_reg &tg4_offset,
                                 unsigned coord_components,
                                 unsigned grad_components)
 {
@@ -3991,7 +3977,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
       sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
 
    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
-       offset_value.file != BAD_FILE || inst->eot ||
+       inst->offset != 0 || inst->eot ||
        op == SHADER_OPCODE_SAMPLEINFO ||
        is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header to
@@ -4136,7 +4122,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
 
      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
-                 offset(offset_value, bld, i));
+                 offset(tg4_offset, bld, i));
 
      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));
@@ -4188,7 +4174,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
    const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
    const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
-   const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE];
+   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
@@ -4197,12 +4183,12 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    if (devinfo->gen >= 7) {
       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      mcs, surface, sampler, offset_value,
+                                      mcs, surface, sampler, tg4_offset,
                                       coord_components, grad_components);
    } else if (devinfo->gen >= 5) {
       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      surface, sampler, offset_value,
+                                      surface, sampler,
                                       coord_components, grad_components);
    } else {
       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
@@ -4671,7 +4657,7 @@ get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
       (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
-       inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
+       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
       inst->components_read(TEX_LOGICAL_SRC_MCS);
 
    /* SIMD16 messages with more than five arguments exceed the maximum message
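get_sampler_lowered_simd_width() above sums the per-channel components of every live sampler source (counting the TG4 gather offsets only for the gather-with-offset opcode) and, per the comment that closes the hunk, falls back to SIMD8 once a SIMD16 message would carry more than five such arguments. A rough standalone sketch of that counting, with the sources reduced to simple flags rather than reading the counts off the logical instruction as the real pass does:

#include <cstdio>

static unsigned
lowered_width(unsigned coord_components, unsigned grad_components,
              bool has_shadow_c, bool has_lod, bool has_sample_index,
              bool is_tg4_offset, bool has_mcs)
{
   const unsigned num_payload_components =
      coord_components +
      (has_shadow_c ? 1 : 0) +
      (has_lod ? 1 : 0) +
      2 * grad_components +          /* LOD and LOD2 each carry the gradients */
      (has_sample_index ? 1 : 0) +
      (is_tg4_offset ? 2 : 0) +      /* offu, offv */
      (has_mcs ? 1 : 0);

   return num_payload_components > 5 ? 8 : 16;
}

int main()
{
   /* textureGrad() on a 3D texture: 3 coordinates plus two 3-component gradients. */
   printf("textureGrad 3D -> SIMD%u\n",
          lowered_width(3, 3, false, false, false, false, false));
   /* plain 2D texture() lookup: 2 coordinates and an implicit LOD. */
   printf("texture 2D     -> SIMD%u\n",
          lowered_width(2, 0, false, true, false, false, false));
   return 0;
}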
@@ -5686,7 +5672,7 @@ fs_visitor::optimize()
 
       OPT(opt_algebraic);
       OPT(opt_cse);
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       OPT(opt_predicated_break, this);
       OPT(opt_cmod_propagation);
       OPT(dead_code_eliminate);
@@ -5710,7 +5696,7 @@ fs_visitor::optimize()
    }
 
    if (OPT(lower_d2x)) {
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       OPT(dead_code_eliminate);
    }
 
@@ -5722,12 +5708,12 @@ fs_visitor::optimize()
    OPT(lower_logical_sends);
 
    if (progress) {
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       /* Only run after logical send lowering because it's easier to implement
        * in terms of physical sends.
        */
       if (OPT(opt_zero_samples))
-         OPT(opt_copy_propagate);
+         OPT(opt_copy_propagation);
       /* Run after logical send lowering to give it a chance to CSE the
        * LOAD_PAYLOAD instructions created to construct the payloads of
        * e.g. texturing messages in cases where it wasn't possible to CSE the
@@ -5756,7 +5742,7 @@ fs_visitor::optimize()
    if (devinfo->gen <= 5 && OPT(lower_minmax)) {
       OPT(opt_cmod_propagation);
       OPT(opt_cse);
-      OPT(opt_copy_propagate);
+      OPT(opt_copy_propagation);
       OPT(dead_code_eliminate);
    }
 
@@ -5962,15 +5948,15 @@ fs_visitor::run_tcs_single_patch()
    }
 
    /* Fix the disptach mask */
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
      bld.CMP(bld.null_reg_ud(), invocation_id,
-              brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L);
+              brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
      bld.IF(BRW_PREDICATE_NORMAL);
    }
 
    emit_nir_code();
 
-   if (nir->info->tcs.vertices_out % 8) {
+   if (nir->info->tess.tcs_vertices_out % 8) {
      bld.emit(BRW_OPCODE_ENDIF);
    }
 
@@ -6419,15 +6405,21 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str)
 {
+   const struct gen_device_info *devinfo = compiler->devinfo;
+
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
-   brw_nir_lower_fs_inputs(shader, vue_map, prog, compiler->devinfo, key);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_fs_inputs(shader, devinfo, key);
    brw_nir_lower_fs_outputs(shader);
+
+   if (devinfo->gen < 6) {
+      brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+   }
+
    if (!key->multisample_fbo)
       NIR_PASS_V(shader, demote_sample_qualifiers);
    NIR_PASS_V(shader, move_interpolation_to_top);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    /* key->alpha_test_func means simulating alpha testing via discards,
    * so the shader definitely kills pixels.
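The run_tcs_single_patch() change above predicates the patch body whenever tess.tcs_vertices_out is not a multiple of the 8-wide dispatch, so only invocations below the vertex count execute the shader. A small standalone illustration of which channels survive that CMP/IF, in plain C++ with one loop iteration per channel:

#include <cstdio>

int main()
{
   const unsigned tcs_vertices_out = 3;   /* e.g. a triangle patch */

   if (tcs_vertices_out % 8) {
      for (unsigned invocation_id = 0; invocation_id < 8; invocation_id++) {
         const bool enabled = invocation_id < tcs_vertices_out;
         printf("channel %u: %s\n", invocation_id,
                enabled ? "runs the patch shader" : "predicated off");
      }
   }
   return 0;
}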
@@ -6449,6 +6441,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                               shader->info->outputs_read);
 
    prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info->fs.inner_coverage;
 
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
@@ -6650,8 +6644,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-                                      true);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
    brw_nir_lower_cs_shared(shader);
    prog_data->base.total_shared += shader->num_shared;
 
@@ -6664,7 +6657,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                            (unsigned)4 * (prog_data->thread_local_id_index + 1));
 
    brw_nir_lower_intrinsics(shader, &prog_data->base);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
    prog_data->local_size[0] = shader->info->cs.local_size[0];
    prog_data->local_size[1] = shader->info->cs.local_size[1];
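brw_compile_cs() above copies the workgroup dimensions out of shader->info->cs.local_size; multiplied together they give the invocations per group, and dividing (rounding up) by a SIMD width gives the number of hardware threads a group would need at that width. A standalone sketch of only that arithmetic — the driver's actual SIMD-width selection applies further constraints not shown here:

#include <cstdio>
#include <initializer_list>

int main()
{
   const unsigned local_size[3] = {8, 8, 1};   /* e.g. an 8x8 tile */
   const unsigned group_size =
      local_size[0] * local_size[1] * local_size[2];

   for (unsigned simd_width : {8u, 16u, 32u}) {
      const unsigned threads = (group_size + simd_width - 1) / simd_width;
      printf("SIMD%-2u: %u invocations -> %u thread(s) per group\n",
             simd_width, group_size, threads);
   }
   return 0;
}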