case GLSL_TYPE_BOOL:
return type->components();
case GLSL_TYPE_DOUBLE:
+ case GLSL_TYPE_UINT64:
+ case GLSL_TYPE_INT64:
return type->components() * 2;
case GLSL_TYPE_ARRAY:
return type_size_scalar(type->fields.array) * type->length;
return 0;
}
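/* A quick sanity check of the sizing above, with hypothetical inputs: a
 * dvec3 or i64vec3 reports 3 components and so counts 3 * 2 = 6 scalar
 * slots, and an array "double a[4]" counts type_size_scalar(double) * 4 =
 * 2 * 4 = 8 slots, since every 64-bit scalar occupies two 32-bit locations.
 */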
-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
- if (type->is_double()) {
- return type_size_dvec4(type);
- } else {
- return type_size_vec4(type);
- }
-}
-
/**
* Create a MOV to read the timestamp register.
*
opcode == SHADER_OPCODE_TXD_LOGICAL)
return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
/* Texture gather offset. */
- else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE)
+ else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
return 2;
/* MCS */
else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
}
static void
-set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+ unsigned *max_chunk_bitsize,
+ bool contiguous, unsigned bitsize,
+ const unsigned target_bitsize,
int *push_constant_loc, int *pull_constant_loc,
unsigned *num_push_constants,
unsigned *num_pull_constants,
if (*chunk_start < 0)
*chunk_start = uniform;
+ /* Keep track of the maximum bit size accessed within this run of contiguous uniforms */
+ *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
/* If this element does not need to be contiguous with the next, we
* split at this point and everything between chunk_start and u forms a
* single chunk.
*/
if (!contiguous) {
+ /* If the chunk's bit size doesn't match the target, skip it */
+ if (*max_chunk_bitsize != target_bitsize) {
+ /* FIXME: right now we only support 32 and 64-bit accesses */
+ assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+ *max_chunk_bitsize = 0;
+ *chunk_start = -1;
+ return;
+ }
+
unsigned chunk_size = uniform - *chunk_start + 1;
/* Decide whether we should push or pull this parameter. In the
pull_constant_loc[j] = (*num_pull_constants)++;
}
+ *max_chunk_bitsize = 0;
*chunk_start = -1;
}
}
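/* Sketch of how the early return above behaves, assuming the two-pass walk
 * below: during the 64-bit pass (target_bitsize == 8), a finished chunk
 * whose accesses were all 32-bit has *max_chunk_bitsize == 4, so the chunk
 * is reset and left for the later 32-bit pass (target_bitsize == 4) to
 * assign; a chunk containing any 64-bit access is handled in the first
 * pass, which keeps it 64-bit aligned within the push constant buffer.
 */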
bool is_live[uniforms];
memset(is_live, 0, sizeof(is_live));
- bool is_live_64bit[uniforms];
- memset(is_live_64bit, 0, sizeof(is_live_64bit));
+ unsigned bitsize_access[uniforms];
+ memset(bitsize_access, 0, sizeof(bitsize_access));
/* For each uniform slot, a value of true indicates that the given slot and
* the next slot must remain contiguous. This is used to keep us from
for (unsigned j = constant_nr; j < last; j++) {
is_live[j] = true;
contiguous[j] = true;
- if (type_sz(inst->src[i].type) == 8) {
- is_live_64bit[j] = true;
- }
+ bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type));
}
is_live[last] = true;
+ bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type));
} else {
if (constant_nr >= 0 && constant_nr < (int) uniforms) {
int regs_read = inst->components_read(i) *
type_sz(inst->src[i].type) / 4;
for (int j = 0; j < regs_read; j++) {
is_live[constant_nr + j] = true;
- if (type_sz(inst->src[i].type) == 8) {
- is_live_64bit[constant_nr + j] = true;
- }
+ bitsize_access[constant_nr + j] =
+ MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type));
}
}
}
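/* Example of the bitsize_access[] bookkeeping above, with hypothetical slot
 * numbers: a DF source covering uniform slots 6 and 7 sets
 * bitsize_access[6] and bitsize_access[7] to 8; if a float source later
 * touches slot 6 as well, MAX2(8, 4) keeps it at 8, so the slot is still
 * grouped with the 64-bit chunks when locations are assigned below.
 */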
memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
int chunk_start = -1;
+ unsigned max_chunk_bitsize = 0;
/* First push 64-bit uniforms to ensure they are properly aligned */
+ const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
for (unsigned u = 0; u < uniforms; u++) {
- if (!is_live[u] || !is_live_64bit[u])
+ if (!is_live[u])
continue;
- set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+ set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+ contiguous[u], bitsize_access[u],
+ uniform_64_bit_size,
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
max_push_components, max_chunk_size,
}
/* Then push the rest of the uniforms */
+ const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
for (unsigned u = 0; u < uniforms; u++) {
- if (!is_live[u] || is_live_64bit[u])
+ if (!is_live[u])
continue;
/* Skip thread_local_id_index to put it in the last push register. */
if (thread_local_id_index == (int)u)
continue;
- set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+ set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+ contiguous[u], bitsize_access[u],
+ uniform_32_bit_size,
push_constant_loc, pull_constant_loc,
&num_push_constants, &num_pull_constants,
max_push_components, max_chunk_size,
if (pull_index == -1)
continue;
- const unsigned index = stage_prog_data->binding_table.pull_constants_start;
- fs_reg dst;
-
- if (type_sz(inst->src[i].type) <= 4)
- dst = vgrf(glsl_type::float_type);
- else
- dst = vgrf(glsl_type::double_type);
-
assert(inst->src[i].stride == 0);
- const fs_builder ubld = ibld.exec_all().group(8, 0);
- struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+ const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+ const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+ const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+ const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+ const unsigned base = pull_index * 4;
+
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, brw_imm_ud(index), offset);
+ dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
inst->src[i].nr = dst.nr;
- inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
+ inst->src[i].offset = (base & (block_sz - 1)) +
+ inst->src[i].offset % 4;
brw_mark_surface_used(prog_data, index);
}
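/* Worked example of the addressing above, with illustrative numbers: for
 * block_sz == 64 and pull_index == 19, base == 76, so the send fetches the
 * cache line at byte offset 64 (base & ~63) into the 16-wide UD temporary,
 * and the source is rewritten to read at offset 12 (base & 63) plus its
 * original sub-dword offset within that VGRF.
 */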
continue;
if (devinfo->gen >= 7) {
- /* The offset arg is a vec4-aligned immediate byte offset. */
- fs_reg const_offset_reg = inst->src[1];
- assert(const_offset_reg.file == IMM &&
- const_offset_reg.type == BRW_REGISTER_TYPE_UD);
- assert(const_offset_reg.ud % 16 == 0);
-
- fs_reg payload, offset;
- if (devinfo->gen >= 9) {
- /* We have to use a message header on Skylake to get SIMD4x2
- * mode. Reserve space for the register.
- */
- offset = payload = fs_reg(VGRF, alloc.allocate(2));
- offset.offset += REG_SIZE;
- inst->mlen = 2;
- } else {
- offset = payload = fs_reg(VGRF, alloc.allocate(1));
- inst->mlen = 1;
- }
-
- /* This is actually going to be a MOV, but since only the first dword
- * is accessed, we have a special opcode to do just that one. Note
- * that this needs to be an operation that will be considered a def
- * by live variable analysis, or register allocation will explode.
- */
- fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
- 8, offset, const_offset_reg);
- setup->force_writemask_all = true;
+ const fs_builder ubld = fs_builder(this, block, inst).exec_all();
+ const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
- setup->ir = inst->ir;
- setup->annotation = inst->annotation;
- inst->insert_before(block, setup);
+ ubld.group(8, 0).MOV(payload,
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ ubld.group(1, 0).MOV(component(payload, 2),
+ brw_imm_ud(inst->src[1].ud / 16));
- /* Similarly, this will only populate the first 4 channels of the
- * result register (since we only use smear values from 0-3), but we
- * don't tell the optimizer.
- */
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
inst->src[1] = payload;
+ inst->header_size = 1;
+ inst->mlen = 1;
invalidate_live_intervals();
} else {
*/
setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
length++;
+ } else if (key->replicate_alpha && inst->target != 0) {
+ /* Handle the case where the fragment shader doesn't write to draw
+ * buffer zero. There is no need to call setup_color_payload() for
+ * src0_alpha because the alpha value will be undefined.
+ */
+ length++;
}
setup_color_payload(bld, key, &sources[length], color0, components);
}
if (has_lod) {
- /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
- * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+ /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
+ * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
*/
assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
bld.dispatch_width() == 16);
const fs_reg &sample_index,
const fs_reg &surface,
const fs_reg &sampler,
- const fs_reg &offset_value,
unsigned coord_components,
unsigned grad_components)
{
fs_reg msg_coords = message;
unsigned header_size = 0;
- if (offset_value.file != BAD_FILE) {
+ if (inst->offset != 0) {
/* The offsets set up by the visitor are in the m1 header, so we can't
* go headerless.
*/
const fs_reg &mcs,
const fs_reg &surface,
const fs_reg &sampler,
- const fs_reg &offset_value,
+ const fs_reg &tg4_offset,
unsigned coord_components,
unsigned grad_components)
{
sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
- offset_value.file != BAD_FILE || inst->eot ||
+ inst->offset != 0 || inst->eot ||
op == SHADER_OPCODE_SAMPLEINFO ||
is_high_sampler(devinfo, sampler)) {
/* For general texture offsets (no txf workaround), we need a header to
for (unsigned i = 0; i < 2; i++) /* offu, offv */
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
- offset(offset_value, bld, i));
+ offset(tg4_offset, bld, i));
if (coord_components == 3) /* r if present */
bld.MOV(sources[length++], offset(coordinate, bld, 2));
const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
- const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE];
+ const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
if (devinfo->gen >= 7) {
lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
shadow_c, lod, lod2, sample_index,
- mcs, surface, sampler, offset_value,
+ mcs, surface, sampler, tg4_offset,
coord_components, grad_components);
} else if (devinfo->gen >= 5) {
lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
shadow_c, lod, lod2, sample_index,
- surface, sampler, offset_value,
+ surface, sampler,
coord_components, grad_components);
} else {
lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
inst->components_read(TEX_LOGICAL_SRC_LOD2) +
inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
(inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
- inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
+ inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
inst->components_read(TEX_LOGICAL_SRC_MCS);
/* SIMD16 messages with more than five arguments exceed the maximum message
OPT(opt_algebraic);
OPT(opt_cse);
- OPT(opt_copy_propagate);
+ OPT(opt_copy_propagation);
OPT(opt_predicated_break, this);
OPT(opt_cmod_propagation);
OPT(dead_code_eliminate);
}
if (OPT(lower_d2x)) {
- OPT(opt_copy_propagate);
+ OPT(opt_copy_propagation);
OPT(dead_code_eliminate);
}
OPT(lower_logical_sends);
if (progress) {
- OPT(opt_copy_propagate);
+ OPT(opt_copy_propagation);
/* Only run after logical send lowering because it's easier to implement
* in terms of physical sends.
*/
if (OPT(opt_zero_samples))
- OPT(opt_copy_propagate);
+ OPT(opt_copy_propagation);
/* Run after logical send lowering to give it a chance to CSE the
* LOAD_PAYLOAD instructions created to construct the payloads of
* e.g. texturing messages in cases where it wasn't possible to CSE the
if (devinfo->gen <= 5 && OPT(lower_minmax)) {
OPT(opt_cmod_propagation);
OPT(opt_cse);
- OPT(opt_copy_propagate);
+ OPT(opt_copy_propagation);
OPT(dead_code_eliminate);
}
}
/* Fix the dispatch mask */
- if (nir->info->tcs.vertices_out % 8) {
+ if (nir->info->tess.tcs_vertices_out % 8) {
bld.CMP(bld.null_reg_ud(), invocation_id,
- brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L);
+ brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
bld.IF(BRW_PREDICATE_NORMAL);
}
emit_nir_code();
- if (nir->info->tcs.vertices_out % 8) {
+ if (nir->info->tess.tcs_vertices_out % 8) {
bld.emit(BRW_OPCODE_ENDIF);
}
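/* Illustrative sketch of the guard above, assuming tcs_vertices_out == 3: an
 * 8-wide patch dispatch still spawns invocations 0..7, so the CMP/IF pair
 * predicates the shader body on invocation_id < 3 and the ENDIF closes that
 * guard; when the vertex count is a multiple of 8 no predication is needed.
 */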
unsigned *final_assembly_size,
char **error_str)
{
+ const struct gen_device_info *devinfo = compiler->devinfo;
+
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
- shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
- true);
- brw_nir_lower_fs_inputs(shader, vue_map, prog, compiler->devinfo, key);
+ shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+ brw_nir_lower_fs_inputs(shader, devinfo, key);
brw_nir_lower_fs_outputs(shader);
+
+ if (devinfo->gen < 6) {
+ brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+ }
+
if (!key->multisample_fbo)
NIR_PASS_V(shader, demote_sample_qualifiers);
NIR_PASS_V(shader, move_interpolation_to_top);
- shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+ shader = brw_postprocess_nir(shader, compiler, true);
/* key->alpha_test_func means simulating alpha testing via discards,
* so the shader definitely kills pixels.
shader->info->outputs_read);
prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
+ prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
+ prog_data->inner_coverage = shader->info->fs.inner_coverage;
prog_data->barycentric_interp_modes =
brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
char **error_str)
{
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
- shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
- true);
+ shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
brw_nir_lower_cs_shared(shader);
prog_data->base.total_shared += shader->num_shared;
(unsigned)4 * (prog_data->thread_local_id_index + 1));
brw_nir_lower_intrinsics(shader, &prog_data->base);
- shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+ shader = brw_postprocess_nir(shader, compiler, true);
prog_data->local_size[0] = shader->info->cs.local_size[0];
prog_data->local_size[1] = shader->info->cs.local_size[1];