#include "brw_program.h"
#include "brw_dead_control_flow.h"
#include "compiler/glsl_types.h"
+#include "program/prog_parameter.h"
using namespace brw;
* CSE can later notice that those loads are all the same and eliminate
* the redundant ones.
*/
- fs_reg vec4_offset = vgrf(glsl_type::int_type);
+ fs_reg vec4_offset = vgrf(glsl_type::uint_type);
bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
int scale = 1;
else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
+ /* The pull load message will load a vec4 (16 bytes). If we are loading
+ * a double, this means we are only loading 2 elements' worth of data.
+ * We also want to use a 32-bit data type for the dst of the load operation
+ * so other parts of the driver don't get confused about the size of the
+ * result.
+ */
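+ /* For 64-bit results, the shuffle_32bit_load_result_to_64bit_data() call
+ * below reassembles consecutive 32-bit pairs into 64-bit values in place
+ * before the final MOV into dst.
+ */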
int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
- fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
+ fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written),
+ BRW_REGISTER_TYPE_F);
fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
inst->regs_written = regs_written;
inst->mlen = 1 + bld.dispatch_width() / 8;
}
- bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale));
+ if (type_sz(dst.type) == 8) {
+ assert(scale == 1);
+ shuffle_32bit_load_result_to_64bit_data(
+ bld, retype(vec4_result, dst.type), vec4_result, 2);
+ }
+
+ vec4_result.type = dst.type;
+ bld.MOV(dst, offset(vec4_result, bld,
+ (const_offset & 0xf) / type_sz(vec4_result.type) * scale));
}
/**
if (i < this->header_size) {
reg.reg_offset += 1;
} else {
- reg.reg_offset += this->exec_size / 8;
+ reg = horiz_offset(reg, this->exec_size);
}
}
{
this->reg_offset = 0;
this->subreg_offset = 0;
- this->reladdr = NULL;
this->stride = 1;
if (this->file == IMM &&
(this->type != BRW_REGISTER_TYPE_V &&
{
return (this->backend_reg::equals(r) &&
subreg_offset == r.subreg_offset &&
- !reladdr && !r.reladdr &&
stride == r.stride);
}
case GLSL_TYPE_FLOAT:
case GLSL_TYPE_BOOL:
return type->components();
+ case GLSL_TYPE_DOUBLE:
+ return type->components() * 2;
case GLSL_TYPE_ARRAY:
return type_size_scalar(type->fields.array) * type->length;
case GLSL_TYPE_STRUCT:
case GLSL_TYPE_VOID:
case GLSL_TYPE_ERROR:
case GLSL_TYPE_INTERFACE:
- case GLSL_TYPE_DOUBLE:
+ case GLSL_TYPE_FUNCTION:
unreachable("not reached");
}
return 4 * type_size_vec4(type);
}
+/* Attribute arrays are loaded as one vec4 per element (or matrix column),
+ * except for double-precision types, which are loaded as one dvec4.
+ */
+extern "C" int
+type_size_vs_input(const struct glsl_type *type)
+{
+ if (type->is_double()) {
+ return type_size_dvec4(type);
+ } else {
+ return type_size_vec4(type);
+ }
+}
+
/**
* Create a MOV to read the timestamp register.
*
{
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
(this->exec_size * type_sz(this->dst.type)) < 32 ||
- !this->dst.is_contiguous());
+ !this->dst.is_contiguous() ||
+ this->dst.subreg_offset > 0);
}
unsigned
assert(src[2].file == IMM);
unsigned region_length = src[2].ud;
- if (src[0].file == FIXED_GRF) {
+ if (src[0].file == UNIFORM) {
+ assert(region_length % 4 == 0);
+ return region_length / 4;
+ } else if (src[0].file == FIXED_GRF) {
/* If the start of the region is not register aligned, then
* there's some portion of the register that's technically
* unread at the beginning.
* unread portion at the beginning.
*/
if (src[0].subnr)
- region_length += src[0].subnr * type_sz(src[0].type);
+ region_length += src[0].subnr;
return DIV_ROUND_UP(region_length, REG_SIZE);
} else {
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_LZ:
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXL_LZ:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_LOD:
case SHADER_OPCODE_SAMPLEINFO:
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
- this->param_size = v->param_size;
}
fs_reg *
-fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
- bool origin_upper_left)
+fs_visitor::emit_fragcoord_interpolation()
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
fs_reg wpos = *reg;
- bool flip = !origin_upper_left ^ key->render_to_fbo;
/* gl_FragCoord.x */
- if (pixel_center_integer) {
- bld.MOV(wpos, this->pixel_x);
- } else {
- bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
- }
+ bld.MOV(wpos, this->pixel_x);
wpos = offset(wpos, bld, 1);
/* gl_FragCoord.y */
- if (!flip && pixel_center_integer) {
- bld.MOV(wpos, this->pixel_y);
- } else {
- fs_reg pixel_y = this->pixel_y;
- float offset = (pixel_center_integer ? 0.0f : 0.5f);
-
- if (flip) {
- pixel_y.negate = true;
- offset += key->drawable_height - 1.0f;
- }
-
- bld.ADD(wpos, pixel_y, brw_imm_f(offset));
- }
+ bld.MOV(wpos, this->pixel_y);
wpos = offset(wpos, bld, 1);
/* gl_FragCoord.z */
inst->no_dd_clear = true;
inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
+ mod_centroid && !key->persample_interp,
+ mod_sample || key->persample_interp);
inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = false;
if (devinfo->has_pln)
} else {
emit_linterp(*attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
+ mod_centroid && !key->persample_interp,
+ mod_sample || key->persample_interp);
}
if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
bld.MUL(*attr, *attr, this->pixel_w);
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
assert(stage == MESA_SHADER_FRAGMENT);
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+ brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
assert(dst.type == BRW_REGISTER_TYPE_F);
- if (key->compute_pos_offset) {
+ if (wm_prog_data->persample_dispatch) {
/* Convert int_sample_pos to floating point */
bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
const fs_builder abld = bld.annotate("compute sample id");
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
- if (key->compute_sample_id) {
+ if (!key->multisample_fbo) {
+ /* As per GL_ARB_sample_shading specification:
+ * "When rendering to a non-multisample buffer, or if multisample
+ * rasterization is disabled, gl_SampleID will always be zero."
+ */
+ abld.MOV(*reg, brw_imm_d(0));
+ } else if (devinfo->gen >= 8) {
+ /* Sample ID comes in as 4-bit numbers in g1.0:
+ *
+ * 15:12 Slot 3 SampleID (only used in SIMD16)
+ * 11:8 Slot 2 SampleID (only used in SIMD16)
+ * 7:4 Slot 1 SampleID
+ * 3:0 Slot 0 SampleID
+ *
+ * Each slot corresponds to four channels, so we want to replicate each
+ * half-byte value to 4 channels in a row:
+ *
+ * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
+ * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
+ *
+ * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
+ * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
+ *
+ * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
+ * channels to read the first byte (7:0), and the second group of 8
+ * channels to read the second byte (15:8). Then, we shift right by
+ * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
+ * values into place. Finally, we AND with 0xf to keep the low nibble.
+ *
+ * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
+ * and(16) dst<1>D tmp<8,8,1>W 0xf:W
+ *
+ * TODO: These payload bits exist on Gen7 too, but they appear to always
+ * be zero, so this code fails to work. We should find out why.
+ */
+ fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+
+ abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
+ BRW_REGISTER_TYPE_B), 1, 8, 0)),
+ brw_imm_v(0x44440000));
+ abld.AND(*reg, tmp, brw_imm_w(0xf));
+ } else {
fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
t1.set_smear(0);
fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
/* SKL+ has an extra bit for the Starting Sample Pair Index to
    * accommodate 16x MSAA.
*/
- unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
-
abld.exec_all().group(1, 0)
.AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
- brw_imm_ud(sspi_mask));
+ brw_imm_ud(0xc0));
abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
/* This works for both SIMD8 and SIMD16 */
- abld.exec_all().group(4, 0)
- .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
+ abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
/* This special instruction takes care of setting vstride=1,
* width=4, hstride=0 of t2 during an ADD instruction.
*/
abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
- } else {
- /* As per GL_ARB_sample_shading specification:
- * "When rendering to a non-multisample buffer, or if multisample
- * rasterization is disabled, gl_SampleID will always be zero."
- */
- abld.MOV(*reg, brw_imm_d(0));
}
return reg;
}
+fs_reg *
+fs_visitor::emit_samplemaskin_setup()
+{
+ assert(stage == MESA_SHADER_FRAGMENT);
+ brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
+ assert(devinfo->gen >= 6);
+
+ fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+
+ fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
+ BRW_REGISTER_TYPE_D));
+
+ if (wm_prog_data->persample_dispatch) {
+ /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
+ * and a mask representing which sample is being processed by the
+ * current shader invocation.
+ *
+ * From the OES_sample_variables specification:
+ * "When per-sample shading is active due to the use of a fragment input
+ * qualified by "sample" or due to the use of the gl_SampleID or
+ * gl_SamplePosition variables, only the bit for the current sample is
+ * set in gl_SampleMaskIn."
+ */
+ const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
+
+ if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+ nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+
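+ /* enabled_mask is 1 << gl_SampleID, so ANDing it with the coverage mask
+ * leaves only the bit for the sample this invocation is shading.
+ */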
+ fs_reg one = vgrf(glsl_type::int_type);
+ fs_reg enabled_mask = vgrf(glsl_type::int_type);
+ abld.MOV(one, brw_imm_d(1));
+ abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
+ abld.AND(*reg, enabled_mask, coverage_mask);
+ } else {
+ /* In per-pixel mode, the coverage mask is sufficient. */
+ *reg = coverage_mask;
+ }
+ return reg;
+}
+
fs_reg
fs_visitor::resolve_source_modifiers(const fs_reg &src)
{
void
fs_visitor::assign_curb_setup()
{
- if (dispatch_width == 8) {
- prog_data->dispatch_grf_start_reg = payload.num_regs;
- } else {
- if (stage == MESA_SHADER_FRAGMENT) {
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
- } else if (stage == MESA_SHADER_COMPUTE) {
- brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
- prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
- } else {
- unreachable("Unsupported shader type!");
- }
- }
-
prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
/* Map the offsets in the UNIFORM file to fixed HW regs. */
inst->src[i].nr +
inst->src[i].reg_offset;
- unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
+ /* As explained at brw_reg_from_fs_reg, from the Haswell PRM:
+ *
+ * VertStride must be used to cross GRF register boundaries. This
+ * rule implies that elements within a 'Width' cannot cross GRF
+ * boundaries.
+ *
+ * So, for registers that are large enough, we have to split the exec
+ * size in two and trust the compression state to sort it out.
+ */
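+ /* For example, a SIMD16 source of 32-bit elements with stride 1 covers
+ * 16 * 4 = 64 bytes (two GRFs), so the region below is built with an exec
+ * size of 8 instead of 16.
+ */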
+ unsigned total_size = inst->exec_size *
+ inst->src[i].stride *
+ type_sz(inst->src[i].type);
+
+ assert(total_size <= 2 * REG_SIZE);
+ const unsigned exec_size =
+ (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
+
+ unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
struct brw_reg reg =
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
inst->src[i].subreg_offset),
- inst->exec_size * inst->src[i].stride,
+ exec_size * inst->src[i].stride,
width, inst->src[i].stride);
reg.abs = inst->src[i].abs;
reg.negate = inst->src[i].negate;
assert(stage == MESA_SHADER_VERTEX);
/* Each attribute is 4 regs. */
- this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
+ this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
assert(vs_prog_data->base.urb_read_length <= 15);
}
}
+void
+fs_visitor::assign_tcs_single_patch_urb_setup()
+{
+ assert(stage == MESA_SHADER_TESS_CTRL);
+
+ /* Rewrite all ATTR file references to HW_REGs. */
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ convert_attr_sources_to_hw_regs(inst);
+ }
+}
+
void
fs_visitor::assign_tes_urb_setup()
{
return progress;
}
+static void
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start, bool contiguous,
+ int *push_constant_loc, int *pull_constant_loc,
+ unsigned *num_push_constants,
+ unsigned *num_pull_constants,
+ const unsigned max_push_components,
+ const unsigned max_chunk_size,
+ struct brw_stage_prog_data *stage_prog_data)
+{
+ /* This is the first live uniform in the chunk */
+ if (*chunk_start < 0)
+ *chunk_start = uniform;
+
+ /* If this element does not need to be contiguous with the next, we
+ * split at this point and everything between chunk_start and this
+ * element forms a single chunk.
+ */
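+ /* For example, a uniform array accessed through MOV_INDIRECT has every
+ * element but the last marked contiguous, so the whole array is pushed or
+ * pulled as a single chunk.
+ */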
+ if (!contiguous) {
+ unsigned chunk_size = uniform - *chunk_start + 1;
+
+ /* Decide whether we should push or pull this parameter. In the
+ * Vulkan driver, push constants are explicitly exposed via the API
+ * so we push everything. In GL, we only push small arrays.
+ */
+ if (stage_prog_data->pull_param == NULL ||
+ (*num_push_constants + chunk_size <= max_push_components &&
+ chunk_size <= max_chunk_size)) {
+ assert(*num_push_constants + chunk_size <= max_push_components);
+ for (unsigned j = *chunk_start; j <= uniform; j++)
+ push_constant_loc[j] = (*num_push_constants)++;
+ } else {
+ for (unsigned j = *chunk_start; j <= uniform; j++)
+ pull_constant_loc[j] = (*num_pull_constants)++;
+ }
+
+ *chunk_start = -1;
+ }
+}
+
/**
* Assign UNIFORM file registers to either push constants or pull constants.
*
* maximum number of fragment shader uniform components (64). If
* there are too many of these, they'd fill up all of register space.
* So, this will push some of them out to the pull constant buffer and
- * update the program to load them. We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
*/
void
fs_visitor::assign_constant_locations()
{
- /* Only the first compile (SIMD8 mode) gets to decide on locations. */
- if (dispatch_width != 8)
+ /* Only the first compile gets to decide on locations. */
+ if (dispatch_width != min_dispatch_width)
return;
- unsigned int num_pull_constants = 0;
-
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
bool is_live[uniforms];
memset(is_live, 0, sizeof(is_live));
+ bool is_live_64bit[uniforms];
+ memset(is_live_64bit, 0, sizeof(is_live_64bit));
+
+ /* For each uniform slot, a value of true indicates that the given slot and
+ * the next slot must remain contiguous. This is used to keep us from
+ * splitting arrays apart.
+ */
+ bool contiguous[uniforms];
+ memset(contiguous, 0, sizeof(contiguous));
/* First, we walk through the instructions and do two things:
*
* 1) Figure out which uniforms are live.
*
- * 2) Find all indirect access of uniform arrays and flag them as needing
- * to go into the pull constant buffer.
+ * 2) Mark any indirectly used ranges of registers as contiguous.
*
* Note that we don't move constant-indexed accesses to arrays. No
* testing has been done of the performance impact of this choice.
if (inst->src[i].file != UNIFORM)
continue;
- if (inst->src[i].reladdr) {
- int uniform = inst->src[i].nr;
+ int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
- /* If this array isn't already present in the pull constant buffer,
- * add it.
- */
- if (pull_constant_loc[uniform] == -1) {
- assert(param_size[uniform]);
- for (int j = 0; j < param_size[uniform]; j++)
- pull_constant_loc[uniform + j] = num_pull_constants++;
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+ assert(inst->src[2].ud % 4 == 0);
+ unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+ assert(last < uniforms);
+
+ for (unsigned j = constant_nr; j < last; j++) {
+ is_live[j] = true;
+ contiguous[j] = true;
+ if (type_sz(inst->src[i].type) == 8) {
+ is_live_64bit[j] = true;
+ }
}
+ is_live[last] = true;
} else {
- /* Mark the the one accessed uniform as live */
- int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
- if (constant_nr >= 0 && constant_nr < (int) uniforms)
- is_live[constant_nr] = true;
+ if (constant_nr >= 0 && constant_nr < (int) uniforms) {
+ int regs_read = inst->components_read(i) *
+ type_sz(inst->src[i].type) / 4;
+ for (int j = 0; j < regs_read; j++) {
+ is_live[constant_nr + j] = true;
+ if (type_sz(inst->src[i].type) == 8) {
+ is_live_64bit[constant_nr + j] = true;
+ }
+ }
+ }
}
}
}
* If changing this value, note the limitation about total_regs in
* brw_curbe.c.
*/
- unsigned int max_push_components = 16 * 8;
+ const unsigned int max_push_components = 16 * 8;
+
+ /* We push small arrays, but no bigger than 16 floats. This is big enough
+ * for a vec4 but hopefully not large enough to push out other stuff. We
+ * should probably use a better heuristic at some point.
+ */
+ const unsigned int max_chunk_size = 16;
+
unsigned int num_push_constants = 0;
+ unsigned int num_pull_constants = 0;
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- for (unsigned int i = 0; i < uniforms; i++) {
- if (!is_live[i] || pull_constant_loc[i] != -1) {
- /* This UNIFORM register is either dead, or has already been demoted
- * to a pull const. Mark it as no longer living in the param[] array.
- */
- push_constant_loc[i] = -1;
+ /* Default to -1 meaning no location */
+ memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+ memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
+ int chunk_start = -1;
+
+ /* First push 64-bit uniforms to ensure they are properly aligned */
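+ /* Push constant slots are 32 bits wide, so laying out the 64-bit
+ * uniforms first keeps each of them at an 8-byte aligned offset.
+ */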
+ for (unsigned u = 0; u < uniforms; u++) {
+ if (!is_live[u] || !is_live_64bit[u])
continue;
- }
- if (num_push_constants < max_push_components) {
- /* Retain as a push constant. Record the location in the params[]
- * array.
- */
- push_constant_loc[i] = num_push_constants++;
- } else {
- /* Demote to a pull constant. */
- push_constant_loc[i] = -1;
- pull_constant_loc[i] = num_pull_constants++;
- }
+ set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+ push_constant_loc, pull_constant_loc,
+ &num_push_constants, &num_pull_constants,
+ max_push_components, max_chunk_size,
+ stage_prog_data);
+
+ }
+
+ /* Then push the rest of the uniforms */
+ for (unsigned u = 0; u < uniforms; u++) {
+ if (!is_live[u] || is_live_64bit[u])
+ continue;
+
+ set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+ push_constant_loc, pull_constant_loc,
+ &num_push_constants, &num_pull_constants,
+ max_push_components, max_chunk_size,
+ stage_prog_data);
}
+ /* As the uniforms are going to be reordered, take the data from a temporary
+ * copy of the original param[].
+ */
+ gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
+ stage_prog_data->nr_params);
+ memcpy(param, stage_prog_data->param,
+ sizeof(gl_constant_value*) * stage_prog_data->nr_params);
stage_prog_data->nr_params = num_push_constants;
stage_prog_data->nr_pull_params = num_pull_constants;
* having to make a copy.
*/
for (unsigned int i = 0; i < uniforms; i++) {
- const gl_constant_value *value = stage_prog_data->param[i];
+ const gl_constant_value *value = param[i];
if (pull_constant_loc[i] != -1) {
stage_prog_data->pull_param[pull_constant_loc[i]] = value;
stage_prog_data->param[push_constant_loc[i]] = value;
}
}
+ ralloc_free(param);
}
/**
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
*/
void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
{
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
+ const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ /* Set up the annotation tracking for new generated instructions. */
+ const fs_builder ibld(this, block, inst);
+
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != UNIFORM)
continue;
- int pull_index;
+ /* We'll handle this case later */
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+ continue;
+
unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
- if (location >= uniforms) /* Out of bounds access */
- pull_index = -1;
- else
- pull_index = pull_constant_loc[location];
+ if (location >= uniforms)
+ continue; /* Out of bounds access */
+
+ int pull_index = pull_constant_loc[location];
if (pull_index == -1)
continue;
- /* Set up the annotation tracking for new generated instructions. */
- const fs_builder ibld(this, block, inst);
-      const unsigned index = stage_prog_data->binding_table.pull_constants_start;
- fs_reg dst = vgrf(glsl_type::float_type);
+ fs_reg dst;
+
+ if (type_sz(inst->src[i].type) <= 4)
+ dst = vgrf(glsl_type::float_type);
+ else
+ dst = vgrf(glsl_type::double_type);
assert(inst->src[i].stride == 0);
- /* Generate a pull load into dst. */
- if (inst->src[i].reladdr) {
- VARYING_PULL_CONSTANT_LOAD(ibld, dst,
- brw_imm_ud(index),
- *inst->src[i].reladdr,
- pull_index * 4);
- inst->src[i].reladdr = NULL;
- inst->src[i].stride = 1;
- } else {
- const fs_builder ubld = ibld.exec_all().group(8, 0);
- struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
- ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, brw_imm_ud(index), offset);
- inst->src[i].set_smear(pull_index & 3);
- }
- brw_mark_surface_used(prog_data, index);
+ const fs_builder ubld = ibld.exec_all().group(8, 0);
+ struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ dst, brw_imm_ud(index), offset);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
inst->src[i].nr = dst.nr;
inst->src[i].reg_offset = 0;
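+ /* The pull load returns a whole 16-byte vec4; select the right component
+ * within it. set_smear() counts in units of the source type, so the dword
+ * index (pull_index & 3) is scaled by 4 / type_sz: dword index 2 of a
+ * 64-bit uniform becomes smear 1.
+ */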
+ inst->src[i].set_smear((pull_index & 3) * 4 /
+ type_sz(inst->src[i].type));
+
+ brw_mark_surface_used(prog_data, index);
+ }
+
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+ inst->src[0].file == UNIFORM) {
+
+ unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+ if (location >= uniforms)
+ continue; /* Out of bounds access */
+
+ int pull_index = pull_constant_loc[location];
+
+ if (pull_index == -1)
+ continue;
+
+ VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+ brw_imm_ud(index),
+ inst->src[1],
+ pull_index * 4);
+ inst->remove(block);
+
+ brw_mark_surface_used(prog_data, index);
}
}
invalidate_live_intervals();
progress = true;
}
break;
- case SHADER_OPCODE_RCP: {
- fs_inst *prev = (fs_inst *)inst->prev;
- if (prev->opcode == SHADER_OPCODE_SQRT) {
- if (inst->src[0].equals(prev->dst)) {
- inst->opcode = SHADER_OPCODE_RSQ;
- inst->src[0] = prev->src[0];
- progress = true;
- }
- }
- break;
- }
case SHADER_OPCODE_BROADCAST:
if (is_uniform(inst->src[0])) {
inst->opcode = BRW_OPCODE_MOV;
tex_inst->offset |= fb_write->target << 24;
tex_inst->eot = true;
tex_inst->dst = ibld.null_reg_ud();
+ tex_inst->regs_written = 0;
fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
/* If a header is present, marking the eot is sufficient. Otherwise, we need
* we have enough space, but it will make sure the dead code eliminator kills
* the instruction that this will replace.
*/
- if (tex_inst->header_size != 0)
+ if (tex_inst->header_size != 0) {
+ invalidate_live_intervals();
return true;
+ }
fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
load_payload->sources + 1);
tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
tex_inst->src[0] = send_header;
+ invalidate_live_intervals();
return true;
}
fs_visitor::emit_repclear_shader()
{
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- int base_mrf = 1;
+ int base_mrf = 0;
int color_mrf = base_mrf + 2;
+ fs_inst *mov;
- fs_inst *mov = bld.exec_all().group(4, 0)
- .MOV(brw_message_reg(color_mrf),
- fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ if (uniforms > 0) {
+ mov = bld.exec_all().group(4, 0)
+ .MOV(brw_message_reg(color_mrf),
+ fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ } else {
+ struct brw_reg reg =
+ brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+ mov = bld.exec_all().group(4, 0)
+ .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+ }
fs_inst *write;
if (key->nr_color_regions == 1) {
assign_curb_setup();
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
- assert(mov->src[0].file == FIXED_GRF);
- mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ if (uniforms > 0) {
+ assert(mov->src[0].file == FIXED_GRF);
+ mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ }
}
/**
return progress;
}
+bool
+fs_visitor::lower_minmax()
+{
+ assert(devinfo->gen < 6);
+
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const fs_builder ibld(this, block, inst);
+
+ if (inst->opcode == BRW_OPCODE_SEL &&
+ inst->predicate == BRW_PREDICATE_NONE) {
+ /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+ * the original SEL.L/GE instruction
+ */
+ ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+ inst->conditional_mod);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
fs_reg *dst, fs_reg color, unsigned components)
header_size = 1;
sources[0] = fs_reg();
length++;
+
+ /* If we're requesting fewer than four channels worth of response,
+ * and we have an explicit header, we need to set up the sampler
+ * writemask. It's reversed from normal: 1 means "don't write".
+ */
+ if (inst->regs_written != 4 * reg_width) {
+ assert((inst->regs_written % reg_width) == 0);
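+ /* For example, if only the x and y components are needed, regs_written
+ * is 2 * reg_width, giving mask = ~0x3 & 0xf = 0xc, i.e. z and w are
+ * marked "don't write".
+ */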
+ unsigned mask = ~((1 << (inst->regs_written / reg_width)) - 1) & 0xf;
+ inst->offset |= mask << 12;
+ }
}
if (shadow_c.file != BAD_FILE) {
switch (op) {
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXL:
+ if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
+ op = SHADER_OPCODE_TXL_LZ;
+ break;
+ }
bld.MOV(sources[length], lod);
length++;
break;
length++;
}
- bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
- length++;
+ if (devinfo->gen >= 9 && lod.is_zero()) {
+ op = SHADER_OPCODE_TXF_LZ;
+ } else {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+ length++;
+ }
for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
coordinate_done = true;
break;
+
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_UMS:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_SIN:
- case SHADER_OPCODE_COS: {
+ case SHADER_OPCODE_COS:
+ case FS_OPCODE_PACK: {
/* According to the PRMs:
* "A. In Direct Addressing mode, a source cannot span more than 2
* adjacent GRF registers.
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
return 8;
- default:
- return inst->exec_size;
- }
-}
+ case SHADER_OPCODE_MOV_INDIRECT:
+ /* Prior to Broadwell, we only have 8 address subregisters */
+ if (devinfo->gen < 8)
+ return 8;
-/**
- * The \p rows array of registers represents a \p num_rows by \p num_columns
- * matrix in row-major order, write it in column-major order into the register
- * passed as destination. \p stride gives the separation between matrix
- * elements in the input in fs_builder::dispatch_width() units.
- */
-static void
-emit_transpose(const fs_builder &bld,
- const fs_reg &dst, const fs_reg *rows,
- unsigned num_rows, unsigned num_columns, unsigned stride)
-{
- fs_reg *const components = new fs_reg[num_rows * num_columns];
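+ /* A SIMD16 indirect move of 64-bit data would write four GRFs, so it
+ * gets lowered to a pair of SIMD8 moves, while 32-bit data can stay
+ * SIMD16 and keep the destination within two GRFs.
+ */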
+ if (inst->exec_size < 16) {
+ return inst->exec_size;
+ } else {
+ assert(type_sz(inst->dst.type) >= 4);
+ return MIN2(inst->exec_size / (type_sz(inst->dst.type) / 4), 16);
+ }
- for (unsigned i = 0; i < num_columns; ++i) {
- for (unsigned j = 0; j < num_rows; ++j)
- components[num_rows * i + j] = offset(rows[j], bld, stride * i);
+ default:
+ return inst->exec_size;
}
-
- bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
-
- delete[] components;
}
bool
if (inst->src[j].file != BAD_FILE &&
!is_uniform(inst->src[j])) {
/* Get the i-th copy_width-wide chunk of the source. */
- const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
+ const fs_builder cbld = lbld.group(copy_width, 0);
+ const fs_reg src = offset(inst->src[j], cbld, i);
const unsigned src_size = inst->components_read(j);
- /* Use a trivial transposition to copy one every n
- * copy_width-wide components of the register into a
- * temporary passed as source to the lowered instruction.
+ /* Copy one every n copy_width-wide components of the
+ * register into a temporary passed as source to the lowered
+ * instruction.
*/
split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
- emit_transpose(lbld.group(copy_width, 0),
- split_inst.src[j], &src, 1, src_size, n);
+
+ for (unsigned k = 0; k < src_size; ++k)
+ cbld.MOV(offset(split_inst.src[j], lbld, k),
+ offset(src, cbld, n * k));
}
}
split_inst.dst = dsts[i] =
lbld.vgrf(inst->dst.type, dst_size);
split_inst.regs_written =
- DIV_ROUND_UP(inst->regs_written * lower_width,
- inst->exec_size);
+ DIV_ROUND_UP(type_sz(inst->dst.type) * dst_size * lower_width,
+ REG_SIZE);
}
lbld.emit(split_inst);
}
if (inst->regs_written) {
- /* Distance between useful channels in the temporaries, skipping
- * garbage if the lowered instruction is wider than the original.
- */
- const unsigned m = lower_width / copy_width;
+ const fs_builder lbld = ibld.group(lower_width, 0);
/* Interleave the components of the result from the lowered
- * instructions. We need to set exec_all() when copying more than
- * one half per component, because LOAD_PAYLOAD (in terms of which
- * emit_transpose is implemented) can only use the same channel
- * enable signals for all of its non-header sources.
+ * instructions.
*/
- emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
- .group(copy_width, 0),
- inst->dst, dsts, n, dst_size, m);
+ for (unsigned i = 0; i < dst_size; ++i) {
+ for (unsigned j = 0; j < n; ++j) {
+ const fs_builder cbld = ibld.group(copy_width, j);
+ cbld.MOV(offset(inst->dst, cbld, n * i + j),
+ offset(dsts[j], lbld, i));
+ }
+ }
}
inst->remove(block);
inst->flag_subreg);
}
- fprintf(file, "%s", brw_instruction_name(inst->opcode));
+ fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
if (inst->saturate)
fprintf(file, ".sat");
if (inst->conditional_mod) {
break;
case UNIFORM:
fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
- if (inst->src[i].reladdr) {
- fprintf(file, "+reladdr");
- } else if (inst->src[i].subreg_offset) {
+ if (inst->src[i].subreg_offset) {
fprintf(file, "+%d.%d", inst->src[i].reg_offset,
inst->src[i].subreg_offset);
}
case BRW_REGISTER_TYPE_F:
fprintf(file, "%-gf", inst->src[i].f);
break;
+ case BRW_REGISTER_TYPE_DF:
+ fprintf(file, "%fdf", inst->src[i].df);
+ break;
case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_D:
fprintf(file, "%dd", inst->src[i].d);
{
if (end == start ||
end->is_partial_write() ||
- reg.reladdr ||
!reg.equals(end->dst)) {
return NULL;
} else {
}
void
-fs_visitor::setup_payload_gen6()
+fs_visitor::setup_fs_payload_gen6()
{
- bool uses_depth =
- (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+ assert(stage == MESA_SHADER_FRAGMENT);
+ brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
+
unsigned barycentric_interp_modes =
(stage == MESA_SHADER_FRAGMENT) ?
((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
}
/* R27: interpolated depth if uses source depth */
- if (uses_depth) {
+ prog_data->uses_src_depth =
+ (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+ if (prog_data->uses_src_depth) {
payload.source_depth_reg = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
payload.num_regs++;
}
}
+
/* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
- if (uses_depth) {
+ prog_data->uses_src_w =
+ (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+ if (prog_data->uses_src_w) {
payload.source_w_reg = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
}
}
- if (stage == MESA_SHADER_FRAGMENT) {
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- prog_data->uses_pos_offset = key->compute_pos_offset;
- /* R31: MSAA position offsets. */
- if (prog_data->uses_pos_offset) {
- payload.sample_pos_reg = payload.num_regs;
- payload.num_regs++;
- }
+ /* R31: MSAA position offsets. */
+ if (prog_data->persample_dispatch &&
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+ /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+ *
+ * "MSDISPMODE_PERSAMPLE is required in order to select
+ * POSOFFSET_SAMPLE"
+ *
+ * So we can only really get sample positions if we are doing real
+ * per-sample dispatch. If we need gl_SamplePosition and we don't have
+ * persample dispatch, we hard-code it to 0.5.
+ */
+ prog_data->uses_pos_offset = true;
+ payload.sample_pos_reg = payload.num_regs;
+ payload.num_regs++;
}
/* R32: MSAA input coverage mask */
- if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
+ prog_data->uses_sample_mask =
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+ if (prog_data->uses_sample_mask) {
assert(devinfo->gen >= 7);
payload.sample_mask_in_reg = payload.num_regs;
payload.num_regs++;
payload.num_regs++;
}
- /* Use a maximum of 32 registers for push-model inputs. */
- const unsigned max_push_components = 32;
+ /* Use a maximum of 24 registers for push-model inputs. */
+ const unsigned max_push_components = 24;
/* If pushing our inputs would take too many registers, reduce the URB read
* length (which is in HWords, or 8 registers), and resort to pulling.
}
}
+/**
+ * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
+ *
+ * The needs_unlit_centroid_workaround ends up producing one of these per
+ * channel of centroid input, so it's good to clean them up.
+ *
+ * An assumption here is that nothing ever modifies the dispatched pixels
+ * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
+ * dictates that anyway.
+ */
+bool
+fs_visitor::opt_drop_redundant_mov_to_flags()
+{
+ bool flag_mov_found[2] = {false};
+ bool progress = false;
+
+ /* Instructions removed by this pass can only be added if this were true */
+ if (!devinfo->needs_unlit_centroid_workaround)
+ return false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ if (inst->is_control_flow()) {
+ memset(flag_mov_found, 0, sizeof(flag_mov_found));
+ } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+ if (!flag_mov_found[inst->flag_subreg]) {
+ flag_mov_found[inst->flag_subreg] = true;
+ } else {
+ inst->remove(block);
+ progress = true;
+ }
+ } else if (inst->writes_flag()) {
+ flag_mov_found[inst->flag_subreg] = false;
+ }
+ }
+
+ return progress;
+}
+
void
fs_visitor::optimize()
{
bld = fs_builder(this, 64);
assign_constant_locations();
- demote_pull_constants();
+ lower_constant_loads();
validate();
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
char filename[64];
- snprintf(filename, 64, "%s%d-%s-00-start",
+ snprintf(filename, 64, "%s%d-%s-00-00-start",
stage_abbrev, dispatch_width, nir->info.name);
backend_shader::dump_instructions(filename);
int iteration = 0;
int pass_num = 0;
+ OPT(opt_drop_redundant_mov_to_flags);
+
OPT(lower_simd_width);
OPT(lower_logical_sends);
OPT(dead_code_eliminate);
}
+ if (OPT(lower_pack)) {
+ OPT(register_coalesce);
+ OPT(dead_code_eliminate);
+ }
+
+ if (OPT(lower_d2x)) {
+ OPT(opt_copy_propagate);
+ OPT(dead_code_eliminate);
+ }
+
OPT(opt_combine_constants);
OPT(lower_integer_multiplication);
+ if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+ OPT(opt_cmod_propagation);
+ OPT(opt_cse);
+ OPT(opt_copy_propagate);
+ OPT(dead_code_eliminate);
+ }
+
lower_uniform_pull_constant_loads();
validate();
void
fs_visitor::fixup_3src_null_dest()
{
+ bool progress = false;
+
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
- if (inst->is_3src() && inst->dst.is_null()) {
+ if (inst->is_3src(devinfo) && inst->dst.is_null()) {
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
inst->dst.type);
+ progress = true;
}
}
+
+ if (progress)
+ invalidate_live_intervals();
}
void
-fs_visitor::allocate_registers()
+fs_visitor::allocate_registers(bool allow_spilling)
{
bool allocated_without_spills;
SCHEDULE_PRE_LIFO,
};
+ bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
+
/* Try each scheduling heuristic to see if it can successfully register
* allocate without spilling. They should be ordered by decreasing
* performance but increasing likelihood of allocating.
assign_regs_trivial();
allocated_without_spills = true;
} else {
- allocated_without_spills = assign_regs(false);
+ allocated_without_spills = assign_regs(false, spill_all);
}
if (allocated_without_spills)
break;
* SIMD8. There's probably actually some intermediate point where
* SIMD16 with a couple of spills is still better.
*/
- if (dispatch_width == 16) {
+ if (dispatch_width == 16 && min_dispatch_width <= 8) {
fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else {
/* Since we're out of heuristics, just go spill registers until we
* get an allocation.
*/
- while (!assign_regs(true)) {
+ while (!assign_regs(true, spill_all)) {
if (failed)
break;
}
}
+ assert(last_scratch == 0 || allow_spilling);
+
/* This must come after all optimization and register allocation, since
* it inserts dead code that happens to have side effects, and it does
* so based on the actual physical registers in use.
assign_vs_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
+
+ return !failed;
+}
+
+bool
+fs_visitor::run_tcs_single_patch()
+{
+ assert(stage == MESA_SHADER_TESS_CTRL);
+
+ struct brw_tcs_prog_data *tcs_prog_data =
+ (struct brw_tcs_prog_data *) prog_data;
+
+ /* r1-r4 contain the ICP handles. */
+ payload.num_regs = 5;
+
+ if (shader_time_index >= 0)
+ emit_shader_time_begin();
+
+ /* Initialize gl_InvocationID */
+ fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
+ fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
+ bld.MOV(channels_ud, channels_uw);
+
+ if (tcs_prog_data->instances == 1) {
+ invocation_id = channels_ud;
+ } else {
+ invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+ /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
+ fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+ brw_imm_ud(INTEL_MASK(23, 17)));
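+ /* Shifting right by 17 extracts the instance number and shifting left by
+ * 3 multiplies it by 8, so the two are folded into a single shift by
+ * 17 - 3.
+ */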
+ bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
+
+ bld.ADD(invocation_id, instance_times_8, channels_ud);
+ }
+
+ /* Fix the dispatch mask */
+ if (nir->info.tcs.vertices_out % 8) {
+ bld.CMP(bld.null_reg_ud(), invocation_id,
+ brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L);
+ bld.IF(BRW_PREDICATE_NORMAL);
+ }
+
+ emit_nir_code();
+
+ if (nir->info.tcs.vertices_out % 8) {
+ bld.emit(BRW_OPCODE_ENDIF);
+ }
+
+ /* Emit EOT write; set TR DS Cache bit */
+ fs_reg srcs[3] = {
+ fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+ fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
+ fs_reg(brw_imm_ud(0)),
+ };
+ fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+ bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
+
+ fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+ bld.null_reg_ud(), payload);
+ inst->mlen = 3;
+ inst->base_mrf = -1;
+ inst->eot = true;
+
+ if (shader_time_index >= 0)
+ emit_shader_time_end();
+
+ if (failed)
+ return false;
+
+ calculate_cfg();
+
+ optimize();
+
+ assign_curb_setup();
+ assign_tcs_single_patch_urb_setup();
+
+ fixup_3src_null_dest();
+ allocate_registers(true);
return !failed;
}
assign_tes_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
assign_gs_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
return !failed;
}
bool
-fs_visitor::run_fs(bool do_rep_send)
+fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
{
brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
assert(stage == MESA_SHADER_FRAGMENT);
if (devinfo->gen >= 6)
- setup_payload_gen6();
+ setup_fs_payload_gen6();
else
- setup_payload_gen4();
+ setup_fs_payload_gen4();
if (0) {
emit_dummy_fs();
assign_urb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(allow_spilling);
if (failed)
return false;
}
- if (dispatch_width == 8)
- wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
- else
- wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-
return !failed;
}
if (shader_time_index >= 0)
emit_shader_time_begin();
+ if (devinfo->is_haswell && prog_data->total_shared > 0) {
+ /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+ const fs_builder abld = bld.exec_all().group(1, 0);
+ abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+ suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+ }
+
emit_nir_code();
if (failed)
assign_curb_setup();
fixup_3src_null_dest();
- allocate_registers();
+ allocate_registers(true);
if (failed)
return false;
return barycentric_interp_modes;
}
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+ bool shade_model_flat, const nir_shader *shader)
+{
+ prog_data->flat_inputs = 0;
+
+ nir_foreach_variable(var, &shader->inputs) {
+ enum glsl_interp_qualifier interp_qualifier =
+ (enum glsl_interp_qualifier)var->data.interpolation;
+ bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+ (var->data.location == VARYING_SLOT_COL1);
+
+ int input_index = prog_data->urb_setup[var->data.location];
+
+ if (input_index < 0)
+ continue;
+
+ /* flat shading */
+ if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+ (shade_model_flat && is_gl_Color &&
+ interp_qualifier == INTERP_QUALIFIER_NONE))
+ prog_data->flat_inputs |= (1 << input_index);
+ }
+}
+
static uint8_t
computed_depth_mode(const nir_shader *shader)
{
const nir_shader *src_shader,
struct gl_program *prog,
int shader_time_index8, int shader_time_index16,
+ bool allow_spilling,
bool use_rep_send,
unsigned *final_assembly_size,
char **error_str)
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
true);
+ brw_nir_lower_fs_inputs(shader);
+ brw_nir_lower_fs_outputs(shader);
shader = brw_postprocess_nir(shader, compiler->devinfo, true);
/* key->alpha_test_func means simulating alpha testing via discards,
* so the shader definitely kills pixels.
*/
prog_data->uses_kill = shader->info.fs.uses_discard || key->alpha_test_func;
- prog_data->uses_omask =
+ prog_data->uses_omask = key->multisample_fbo &&
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
prog_data->computed_depth_mode = computed_depth_mode(shader);
prog_data->computed_stencil =
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+ prog_data->persample_dispatch =
+ key->multisample_fbo &&
+ (key->persample_interp ||
+ (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+ SYSTEM_BIT_SAMPLE_POS)) ||
+ shader->info.fs.uses_sample_qualifier);
+
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
prog_data->barycentric_interp_modes =
brw_compute_barycentric_interp_modes(compiler->devinfo,
key->flat_shade,
- key->persample_shading,
+ key->persample_interp,
shader);
- fs_visitor v(compiler, log_data, mem_ctx, key,
- &prog_data->base, prog, shader, 8,
- shader_time_index8);
- if (!v.run_fs(false /* do_rep_send */)) {
+ cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+ uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
+ unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+
+ fs_visitor v8(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 8,
+ shader_time_index8);
+ if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
if (error_str)
- *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+ *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
return NULL;
+ } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
+ simd8_cfg = v8.cfg;
+ simd8_grf_start = v8.payload.num_regs;
+ simd8_grf_used = v8.grf_used;
}
- cfg_t *simd16_cfg = NULL;
- fs_visitor v2(compiler, log_data, mem_ctx, key,
- &prog_data->base, prog, shader, 16,
- shader_time_index16);
- if (likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
- if (!v.simd16_unsupported) {
- /* Try a SIMD16 compile */
- v2.import_uniforms(&v);
- if (!v2.run_fs(use_rep_send)) {
- compiler->shader_perf_log(log_data,
- "SIMD16 shader failed to compile: %s",
- v2.fail_msg);
- } else {
- simd16_cfg = v2.cfg;
- }
+ if (!v8.simd16_unsupported &&
+ likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
+ /* Try a SIMD16 compile */
+ fs_visitor v16(compiler, log_data, mem_ctx, key,
+ &prog_data->base, prog, shader, 16,
+ shader_time_index16);
+ v16.import_uniforms(&v8);
+ if (!v16.run_fs(allow_spilling, use_rep_send)) {
+ compiler->shader_perf_log(log_data,
+ "SIMD16 shader failed to compile: %s",
+ v16.fail_msg);
+ } else {
+ simd16_cfg = v16.cfg;
+ simd16_grf_start = v16.payload.num_regs;
+ simd16_grf_used = v16.grf_used;
}
}
- cfg_t *simd8_cfg;
- int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
- if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
+ /* When the caller requests a repclear shader, they want SIMD16-only */
+ if (use_rep_send)
simd8_cfg = NULL;
- prog_data->no_8 = true;
- } else {
- simd8_cfg = v.cfg;
- prog_data->no_8 = false;
+
+ /* Prior to Iron Lake, the PS had a single shader offset with a jump table
+ * at the top to select the shader. We've never implemented that.
+ * Instead, we just give them exactly one shader and we pick the widest one
+ * available.
+ */
+ if (compiler->devinfo->gen < 5 && simd16_cfg)
+ simd8_cfg = NULL;
+
+ if (prog_data->persample_dispatch) {
+ /* Starting with SandyBridge (where we first get MSAA), the different
+ * pixel dispatch combinations are grouped into classifications A
+ * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
+ * generations, the only configurations supporting persample dispatch
+    * are those in which only one dispatch width is enabled.
+ *
+ * If computed depth is enabled, SNB only allows SIMD8 while IVB+
+ * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
+ */
+ if (compiler->devinfo->gen == 6 &&
+ prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+ simd16_cfg = NULL;
+ } else if (simd16_cfg) {
+ simd8_cfg = NULL;
+ }
}
+ /* We have to compute the flat inputs after the visitor is finished running
+ * because it relies on prog_data->urb_setup which is computed in
+ * fs_visitor::calculate_urb_setup().
+ */
+ brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
- v.promoted_constants, v.runtime_check_aads_emit,
+ v8.promoted_constants, v8.runtime_check_aads_emit,
MESA_SHADER_FRAGMENT);
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
shader->info.name));
}
- if (simd8_cfg)
+ if (simd8_cfg) {
+ prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8);
- if (simd16_cfg)
- prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+ if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+ prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+ prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+ }
+ } else if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+ }
return g.get_assembly(final_assembly_size);
}
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
true);
+ brw_nir_lower_cs_shared(shader);
+ prog_data->base.total_shared += shader->num_shared;
shader = brw_postprocess_nir(shader, compiler->devinfo, true);
prog_data->local_size[0] = shader->info.cs.local_size[0];
shader->info.cs.local_size[2];
unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+ unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
NULL, /* Never used in core profile */
shader, 8, shader_time_index);
- if (!v8.run_cs()) {
- fail_msg = v8.fail_msg;
- } else if (local_workgroup_size <= 8 * max_cs_threads) {
- cfg = v8.cfg;
- prog_data->simd_size = 8;
+ if (simd_required <= 8) {
+ if (!v8.run_cs()) {
+ fail_msg = v8.fail_msg;
+ } else {
+ cfg = v8.cfg;
+ prog_data->simd_size = 8;
+ prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+ }
}
fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
!fail_msg && !v8.simd16_unsupported &&
local_workgroup_size <= 16 * max_cs_threads) {
/* Try a SIMD16 compile */
- v16.import_uniforms(&v8);
+ if (simd_required <= 8)
+ v16.import_uniforms(&v8);
if (!v16.run_cs()) {
compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
} else {
cfg = v16.cfg;
prog_data->simd_size = 16;
+ prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
}
}