* from the LIR.
*/
-#include <sys/types.h>
-
-#include "util/hash_table.h"
#include "main/macros.h"
-#include "main/shaderobj.h"
-#include "main/fbobject.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-#include "util/register_allocate.h"
-#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
-#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cs.h"
+#include "brw_nir.h"
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
+#include "brw_program.h"
#include "brw_dead_control_flow.h"
-#include "main/uniforms.h"
-#include "brw_fs_live_variables.h"
-#include "glsl/nir/glsl_types.h"
-#include "program/sampler.h"
+#include "compiler/glsl_types.h"
using namespace brw;
* CSE can later notice that those loads are all the same and eliminate
* the redundant ones.
*/
- fs_reg vec4_offset = vgrf(glsl_type::int_type);
- bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
+ fs_reg vec4_offset = vgrf(glsl_type::uint_type);
+ bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
int scale = 1;
if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
inst->mlen = 1 + bld.dispatch_width() / 8;
}
- bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
+ bld.MOV(dst, offset(vec4_result, bld, ((const_offset & 0xf) / 4) * scale));
}
/**
}
}
+/**
+ * Returns true if this instruction's sources and destinations cannot
+ * safely be the same register.
+ *
+ * In most cases, a register can be written over safely by the same
+ * instruction that is its last use. For a single instruction, the
+ * sources are dereferenced before writing of the destination starts
+ * (naturally).
+ *
+ * However, there are a few cases where this can be problematic:
+ *
+ * - Virtual opcodes that translate to multiple instructions in the
+ * code generator: if src == dst and one instruction writes the
+ * destination before a later instruction reads the source, then
+ * src will have been clobbered.
+ *
+ * - SIMD16 compressed instructions with certain regioning (see below).
+ *
+ * The register allocator uses this information to set up conflicts between
+ * GRF sources and the destination.
+ */
+bool
+fs_inst::has_source_and_destination_hazard() const
+{
+ switch (opcode) {
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ /* Multiple partial writes to the destination */
+ return true;
+ default:
+ /* The SIMD16 compressed instruction
+ *
+ * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
+ *
+ * is actually decoded in hardware as:
+ *
+ * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
+ * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
+ *
+ * Which is safe. However, if we have uniform accesses
+ * happening, we get into trouble:
+ *
+ * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
+ * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
+ *
+ * Now our destination for the first instruction overwrote the
+ * second instruction's src0, and we get garbage for those 8
+ * pixels. There's a similar issue for the pre-gen6
+ * pixel_x/pixel_y, which are registers of 16-bit values and thus
+ * would get stomped by the first decode as well.
+ */
+ if (exec_size == 16) {
+ for (int i = 0; i < sources; i++) {
+ if (src[i].file == VGRF && (src[i].stride == 0 ||
+ src[i].type == BRW_REGISTER_TYPE_UW ||
+ src[i].type == BRW_REGISTER_TYPE_W ||
+ src[i].type == BRW_REGISTER_TYPE_UB ||
+ src[i].type == BRW_REGISTER_TYPE_B)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+}
+
bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
this->file = BAD_FILE;
}
-/** Immediate value constructor. */
-fs_reg::fs_reg(float f)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_F;
- this->stride = 0;
- this->f = f;
-}
-
-/** Immediate value constructor. */
-fs_reg::fs_reg(int32_t i)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_D;
- this->stride = 0;
- this->d = i;
-}
-
-/** Immediate value constructor. */
-fs_reg::fs_reg(uint32_t u)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_UD;
- this->stride = 0;
- this->ud = u;
-}
-
-/** Vector float immediate value constructor. */
-fs_reg::fs_reg(uint8_t vf[4])
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_VF;
- memcpy(&this->ud, vf, sizeof(unsigned));
-}
-
-/** Vector float immediate value constructor. */
-fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
-{
- init();
- this->file = IMM;
- this->type = BRW_REGISTER_TYPE_VF;
- this->ud = (vf0 << 0) | (vf1 << 8) | (vf2 << 16) | (vf3 << 24);
-}
-
-fs_reg::fs_reg(struct brw_reg reg) :
+fs_reg::fs_reg(struct ::brw_reg reg) :
backend_reg(reg)
{
this->reg_offset = 0;
this->subreg_offset = 0;
- this->reladdr = NULL;
this->stride = 1;
if (this->file == IMM &&
(this->type != BRW_REGISTER_TYPE_V &&
bool
fs_reg::equals(const fs_reg &r) const
{
- return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
- reg_offset == r.reg_offset &&
+ return (this->backend_reg::equals(r) &&
subreg_offset == r.subreg_offset &&
- !reladdr && !r.reladdr &&
stride == r.stride);
}
case GLSL_TYPE_ERROR:
case GLSL_TYPE_INTERFACE:
case GLSL_TYPE_DOUBLE:
+ case GLSL_TYPE_FUNCTION:
unreachable("not reached");
}
fs_reg reset = shader_end_time;
reset.set_smear(2);
set_condmod(BRW_CONDITIONAL_Z,
- ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+ ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
ibld.IF(BRW_PREDICATE_NORMAL);
fs_reg start = shader_start_time;
* is 2 cycles. Remove that overhead, so I can forget about that when
* trying to determine the time taken for single instructions.
*/
- cbld.ADD(diff, diff, fs_reg(-2u));
+ cbld.ADD(diff, diff, brw_imm_ud(-2u));
SHADER_TIME_ADD(cbld, 0, diff);
- SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
+ SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
ibld.emit(BRW_OPCODE_ELSE);
- SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
+ SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
ibld.emit(BRW_OPCODE_ENDIF);
}
fs_reg value)
{
int index = shader_time_index * 3 + shader_time_subindex;
- fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
+ struct brw_reg offset = brw_imm_d(index * SHADER_TIME_STRIDE);
fs_reg payload;
if (dispatch_width == 8)
case SHADER_OPCODE_LOD_LOGICAL:
case SHADER_OPCODE_TG4_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
- assert(src[8].file == IMM && src[9].file == IMM);
+ assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
+ src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
/* Texture coordinates. */
- if (i == 0)
- return src[8].ud;
+ if (i == TEX_LOGICAL_SRC_COORDINATE)
+ return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
/* Texture derivatives. */
- else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
- return src[9].ud;
+ else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
+ opcode == SHADER_OPCODE_TXD_LOGICAL)
+ return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
/* Texture offset. */
- else if (i == 7)
+ else if (i == TEX_LOGICAL_SRC_OFFSET_VALUE)
return 2;
/* MCS */
- else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+ else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
return 2;
else
return 1;
assert(src[2].file == IMM);
unsigned region_length = src[2].ud;
- if (src[0].file == FIXED_GRF) {
+ if (src[0].file == UNIFORM) {
+ assert(region_length % 4 == 0);
+ return region_length / 4;
+ } else if (src[0].file == FIXED_GRF) {
/* If the start of the region is not register aligned, then
* there's some portion of the register that's technically
* unread at the beginning.
* unread portion at the beginning.
*/
if (src[0].subnr)
- region_length += src[0].subnr * type_sz(src[0].type);
+ region_length += src[0].subnr;
return DIV_ROUND_UP(region_length, REG_SIZE);
} else {
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
- this->param_size = v->param_size;
}
fs_reg *
if (pixel_center_integer) {
bld.MOV(wpos, this->pixel_x);
} else {
- bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
+ bld.ADD(wpos, this->pixel_x, brw_imm_f(0.5f));
}
wpos = offset(wpos, bld, 1);
offset += key->drawable_height - 1.0f;
}
- bld.ADD(wpos, pixel_y, fs_reg(offset));
+ bld.ADD(wpos, pixel_y, brw_imm_f(offset));
}
wpos = offset(wpos, bld, 1);
}
void
-fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
+fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
const glsl_type *type,
glsl_interp_qualifier interpolation_mode,
- int location, bool mod_centroid,
+ int *location, bool mod_centroid,
bool mod_sample)
{
- attr.type = brw_type_for_base_type(type->get_scalar_type());
-
assert(stage == MESA_SHADER_FRAGMENT);
brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- unsigned int array_elements;
-
- if (type->is_array()) {
- array_elements = type->arrays_of_arrays_size();
- if (array_elements == 0) {
- fail("dereferenced array '%s' has length 0\n", name);
- }
- type = type->without_array();
- } else {
- array_elements = 1;
- }
-
if (interpolation_mode == INTERP_QUALIFIER_NONE) {
bool is_gl_Color =
- location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
+ *location == VARYING_SLOT_COL0 || *location == VARYING_SLOT_COL1;
if (key->flat_shade && is_gl_Color) {
interpolation_mode = INTERP_QUALIFIER_FLAT;
} else {
}
}
- for (unsigned int i = 0; i < array_elements; i++) {
- for (unsigned int j = 0; j < type->matrix_columns; j++) {
- if (prog_data->urb_setup[location] == -1) {
- /* If there's no incoming setup data for this slot, don't
- * emit interpolation for it.
- */
- attr = offset(attr, bld, type->vector_elements);
- location++;
- continue;
- }
+ if (type->is_array() || type->is_matrix()) {
+ const glsl_type *elem_type = glsl_get_array_element(type);
+ const unsigned length = glsl_get_length(type);
- if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
- /* Constant interpolation (flat shading) case. The SF has
- * handed us defined values in only the constant offset
- * field of the setup reg.
- */
- for (unsigned int k = 0; k < type->vector_elements; k++) {
- struct brw_reg interp = interp_reg(location, k);
- interp = suboffset(interp, 3);
- interp.type = attr.type;
- bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
- attr = offset(attr, bld, 1);
- }
- } else {
- /* Smooth/noperspective interpolation case. */
- for (unsigned int k = 0; k < type->vector_elements; k++) {
- struct brw_reg interp = interp_reg(location, k);
- if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
- /* Get the pixel/sample mask into f0 so that we know
- * which pixels are lit. Then, for each channel that is
- * unlit, replace the centroid data with non-centroid
- * data.
- */
- bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
-
- fs_inst *inst;
- inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
- false, false);
- inst->predicate = BRW_PREDICATE_NORMAL;
- inst->predicate_inverse = true;
- if (devinfo->has_pln)
- inst->no_dd_clear = true;
-
- inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
- inst->predicate = BRW_PREDICATE_NORMAL;
- inst->predicate_inverse = false;
- if (devinfo->has_pln)
- inst->no_dd_check = true;
+ for (unsigned i = 0; i < length; i++) {
+ emit_general_interpolation(attr, name, elem_type, interpolation_mode,
+ location, mod_centroid, mod_sample);
+ }
+ } else if (type->is_record()) {
+ for (unsigned i = 0; i < type->length; i++) {
+ const glsl_type *field_type = type->fields.structure[i].type;
+ emit_general_interpolation(attr, name, field_type, interpolation_mode,
+ location, mod_centroid, mod_sample);
+ }
+ } else {
+ assert(type->is_scalar() || type->is_vector());
- } else {
- emit_linterp(attr, fs_reg(interp), interpolation_mode,
- mod_centroid && !key->persample_shading,
- mod_sample || key->persample_shading);
- }
- if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
- bld.MUL(attr, attr, this->pixel_w);
- }
- attr = offset(attr, bld, 1);
- }
+ if (prog_data->urb_setup[*location] == -1) {
+ /* If there's no incoming setup data for this slot, don't
+ * emit interpolation for it.
+ */
+ *attr = offset(*attr, bld, type->vector_elements);
+ (*location)++;
+ return;
+ }
- }
- location++;
+ attr->type = brw_type_for_base_type(type->get_scalar_type());
+
+ if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
+ /* Constant interpolation (flat shading) case. The SF has
+ * handed us defined values in only the constant offset
+ * field of the setup reg.
+ */
+ for (unsigned int i = 0; i < type->vector_elements; i++) {
+ struct brw_reg interp = interp_reg(*location, i);
+ interp = suboffset(interp, 3);
+ interp.type = attr->type;
+ bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
+ *attr = offset(*attr, bld, 1);
+ }
+ } else {
+ /* Smooth/noperspective interpolation case. */
+ for (unsigned int i = 0; i < type->vector_elements; i++) {
+ struct brw_reg interp = interp_reg(*location, i);
+ if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
+ /* Get the pixel/sample mask into f0 so that we know
+ * which pixels are lit. Then, for each channel that is
+ * unlit, replace the centroid data with non-centroid
+ * data.
+ */
+ bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+
+ fs_inst *inst;
+ inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+ false, false);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->predicate_inverse = true;
+ if (devinfo->has_pln)
+ inst->no_dd_clear = true;
+
+ inst = emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+ mod_centroid && !key->persample_shading,
+ mod_sample || key->persample_shading);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->predicate_inverse = false;
+ if (devinfo->has_pln)
+ inst->no_dd_check = true;
+
+ } else {
+ emit_linterp(*attr, fs_reg(interp), interpolation_mode,
+ mod_centroid && !key->persample_shading,
+ mod_sample || key->persample_shading);
+ }
+ if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
+ bld.MUL(*attr, *attr, this->pixel_w);
+ }
+ *attr = offset(*attr, bld, 1);
+ }
}
+ (*location)++;
}
}
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
g0.negate = true;
- bld.ASR(*reg, g0, fs_reg(15));
+ bld.ASR(*reg, g0, brw_imm_d(15));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
* a boolean result from this (1/true or 0/false).
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
g1_6.negate = true;
- bld.ASR(*reg, g1_6, fs_reg(31));
+ bld.ASR(*reg, g1_6, brw_imm_d(31));
}
return reg;
/* Convert int_sample_pos to floating point */
bld.MOV(dst, int_sample_pos);
/* Scale to the range [0, 1] */
- bld.MUL(dst, dst, fs_reg(1 / 16.0f));
+ bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
}
else {
/* From ARB_sample_shading specification:
* rasterization is disabled, gl_SamplePosition will always be
* (0.5, 0.5).
*/
- bld.MOV(dst, fs_reg(0.5f));
+ bld.MOV(dst, brw_imm_f(0.5f));
}
}
abld.exec_all().group(1, 0)
.AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
- fs_reg(sspi_mask));
- abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
+ brw_imm_ud(sspi_mask));
+ abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
/* This works for both SIMD8 and SIMD16 */
abld.exec_all().group(4, 0)
* "When rendering to a non-multisample buffer, or if multisample
* rasterization is disabled, gl_SampleID will always be zero."
*/
- abld.MOV(*reg, fs_reg(0));
+ abld.MOV(*reg, brw_imm_d(0));
}
return reg;
brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
assert(stage == MESA_SHADER_VERTEX);
- int count = _mesa_bitcount_64(vs_prog_data->inputs_read);
- if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
- count++;
/* Each attribute is 4 regs. */
this->first_non_payload_grf += 4 * vs_prog_data->nr_attributes;
}
}
+void
+fs_visitor::assign_tes_urb_setup()
+{
+ assert(stage == MESA_SHADER_TESS_EVAL);
+
+ brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+ first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
+
+ /* Rewrite all ATTR file references to HW_REGs. */
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ convert_attr_sources_to_hw_regs(inst);
+ }
+}
+
void
fs_visitor::assign_gs_urb_setup()
{
* maximum number of fragment shader uniform components (64). If
* there are too many of these, they'd fill up all of register space.
* So, this will push some of them out to the pull constant buffer and
- * update the program to load them. We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
*/
void
fs_visitor::assign_constant_locations()
{
- /* Only the first compile (SIMD8 mode) gets to decide on locations. */
- if (dispatch_width != 8)
+ /* Only the first compile gets to decide on locations. */
+ if (dispatch_width != min_dispatch_width)
return;
- unsigned int num_pull_constants = 0;
-
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
bool is_live[uniforms];
memset(is_live, 0, sizeof(is_live));
+ /* For each uniform slot, a value of true indicates that the given slot and
+ * the next slot must remain contiguous. This is used to keep us from
+ * splitting arrays apart.
+ */
+ bool contiguous[uniforms];
+ memset(contiguous, 0, sizeof(contiguous));
+
/* First, we walk through the instructions and do two things:
*
* 1) Figure out which uniforms are live.
*
- * 2) Find all indirect access of uniform arrays and flag them as needing
- * to go into the pull constant buffer.
+ * 2) Mark any indirectly used ranges of registers as contiguous.
*
* Note that we don't move constant-indexed accesses to arrays. No
* testing has been done of the performance impact of this choice.
if (inst->src[i].file != UNIFORM)
continue;
- if (inst->src[i].reladdr) {
- int uniform = inst->src[i].nr;
+ int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
- /* If this array isn't already present in the pull constant buffer,
- * add it.
- */
- if (pull_constant_loc[uniform] == -1) {
- assert(param_size[uniform]);
- for (int j = 0; j < param_size[uniform]; j++)
- pull_constant_loc[uniform + j] = num_pull_constants++;
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+ assert(inst->src[2].ud % 4 == 0);
+ unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+ assert(last < uniforms);
+
+ for (unsigned j = constant_nr; j < last; j++) {
+ is_live[j] = true;
+ contiguous[j] = true;
}
+ is_live[last] = true;
} else {
- /* Mark the the one accessed uniform as live */
- int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
if (constant_nr >= 0 && constant_nr < (int) uniforms)
is_live[constant_nr] = true;
}
* If changing this value, note the limitation about total_regs in
* brw_curbe.c.
*/
- unsigned int max_push_components = 16 * 8;
+ const unsigned int max_push_components = 16 * 8;
+
+ /* For vulkan we don't limit the max_chunk_size. We set it to 32 float =
+ * 128 bytes, which is the maximum vulkan push constant size.
+ */
+ const unsigned int max_chunk_size = 32;
+
unsigned int num_push_constants = 0;
+ unsigned int num_pull_constants = 0;
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- for (unsigned int i = 0; i < uniforms; i++) {
- if (!is_live[i] || pull_constant_loc[i] != -1) {
- /* This UNIFORM register is either dead, or has already been demoted
- * to a pull const. Mark it as no longer living in the param[] array.
- */
- push_constant_loc[i] = -1;
+ int chunk_start = -1;
+ for (unsigned u = 0; u < uniforms; u++) {
+ push_constant_loc[u] = -1;
+ pull_constant_loc[u] = -1;
+
+ if (!is_live[u])
continue;
- }
- if (num_push_constants < max_push_components) {
- /* Retain as a push constant. Record the location in the params[]
- * array.
- */
- push_constant_loc[i] = num_push_constants++;
- } else {
- /* Demote to a pull constant. */
- push_constant_loc[i] = -1;
- pull_constant_loc[i] = num_pull_constants++;
+ /* This is the first live uniform in the chunk */
+ if (chunk_start < 0)
+ chunk_start = u;
+
+ /* If this element does not need to be contiguous with the next, we
+ * split at this point and everthing between chunk_start and u forms a
+ * single chunk.
+ */
+ if (!contiguous[u]) {
+ unsigned chunk_size = u - chunk_start + 1;
+
+ if (num_push_constants + chunk_size <= max_push_components &&
+ chunk_size <= max_chunk_size) {
+ for (unsigned j = chunk_start; j <= u; j++)
+ push_constant_loc[j] = num_push_constants++;
+ } else {
+ for (unsigned j = chunk_start; j <= u; j++)
+ pull_constant_loc[j] = num_pull_constants++;
+ }
+
+ chunk_start = -1;
}
}
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
*/
void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
{
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
+ const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+ foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ /* Set up the annotation tracking for new generated instructions. */
+ const fs_builder ibld(this, block, inst);
+
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file != UNIFORM)
continue;
- int pull_index;
+ /* We'll handle this case later */
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+ continue;
+
unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
- if (location >= uniforms) /* Out of bounds access */
- pull_index = -1;
- else
- pull_index = pull_constant_loc[location];
+ if (location >= uniforms)
+ continue; /* Out of bounds access */
+
+ int pull_index = pull_constant_loc[location];
if (pull_index == -1)
continue;
- /* Set up the annotation tracking for new generated instructions. */
- const fs_builder ibld(this, block, inst);
- const unsigned index = stage_prog_data->binding_table.pull_constants_start;
- fs_reg dst = vgrf(glsl_type::float_type);
-
assert(inst->src[i].stride == 0);
- /* Generate a pull load into dst. */
- if (inst->src[i].reladdr) {
- VARYING_PULL_CONSTANT_LOAD(ibld, dst,
- fs_reg(index),
- *inst->src[i].reladdr,
- pull_index);
- inst->src[i].reladdr = NULL;
- inst->src[i].stride = 1;
- } else {
- const fs_builder ubld = ibld.exec_all().group(8, 0);
- fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
- ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, fs_reg(index), offset);
- inst->src[i].set_smear(pull_index & 3);
- }
- brw_mark_surface_used(prog_data, index);
+ fs_reg dst = vgrf(glsl_type::float_type);
+ const fs_builder ubld = ibld.exec_all().group(8, 0);
+ struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+ ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+ dst, brw_imm_ud(index), offset);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = VGRF;
inst->src[i].nr = dst.nr;
inst->src[i].reg_offset = 0;
+ inst->src[i].set_smear(pull_index & 3);
+
+ brw_mark_surface_used(prog_data, index);
+ }
+
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+ inst->src[0].file == UNIFORM) {
+
+ unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+ if (location >= uniforms)
+ continue; /* Out of bounds access */
+
+ int pull_index = pull_constant_loc[location];
+
+ if (pull_index == -1)
+ continue;
+
+ VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+ brw_imm_ud(index),
+ inst->src[1],
+ pull_index * 4);
+ inst->remove(block);
+
+ brw_mark_surface_used(prog_data, index);
}
}
invalidate_live_intervals();
if (inst->dst.type != inst->src[0].type)
assert(!"unimplemented: saturate mixed types");
- if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
+ if (brw_saturate_immediate(inst->dst.type,
+ &inst->src[0].as_brw_reg())) {
inst->saturate = false;
progress = true;
}
progress = true;
}
break;
- case SHADER_OPCODE_RCP: {
- fs_inst *prev = (fs_inst *)inst->prev;
- if (prev->opcode == SHADER_OPCODE_SQRT) {
- if (inst->src[0].equals(prev->dst)) {
- inst->opcode = SHADER_OPCODE_RSQ;
- inst->src[0] = prev->src[0];
- progress = true;
- }
- }
- break;
- }
case SHADER_OPCODE_BROADCAST:
if (is_uniform(inst->src[0])) {
inst->opcode = BRW_OPCODE_MOV;
* we have enough space, but it will make sure the dead code eliminator kills
* the instruction that this will replace.
*/
- if (tex_inst->header_size != 0)
+ if (tex_inst->header_size != 0) {
+ invalidate_live_intervals();
return true;
+ }
fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
load_payload->sources + 1);
tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
tex_inst->src[0] = send_header;
+ invalidate_live_intervals();
return true;
}
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
if (depth == 0) {
inst->opcode = BRW_OPCODE_MOV;
- inst->src[0] = fs_reg(0u);
+ inst->src[0] = brw_imm_ud(0u);
inst->sources = 1;
inst->force_writemask_all = true;
progress = true;
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
int base_mrf = 1;
int color_mrf = base_mrf + 2;
+ fs_inst *mov;
- fs_inst *mov = bld.exec_all().group(4, 0)
- .MOV(brw_message_reg(color_mrf),
- fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ if (uniforms > 0) {
+ mov = bld.exec_all().group(4, 0)
+ .MOV(brw_message_reg(color_mrf),
+ fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+ } else {
+ struct brw_reg reg =
+ brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+ mov = bld.exec_all().group(4, 0)
+ .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+ }
fs_inst *write;
if (key->nr_color_regions == 1) {
assign_curb_setup();
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
- assert(mov->src[0].file == FIXED_GRF);
- mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ if (uniforms > 0) {
+ assert(mov->src[0].file == FIXED_GRF);
+ mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+ }
}
/**
continue;
if (devinfo->gen >= 7) {
- /* The offset arg before was a vec4-aligned byte offset. We need to
- * turn it into a dword offset.
- */
+ /* The offset arg is a vec4-aligned immediate byte offset. */
fs_reg const_offset_reg = inst->src[1];
assert(const_offset_reg.file == IMM &&
const_offset_reg.type == BRW_REGISTER_TYPE_UD);
- const_offset_reg.ud /= 4;
+ assert(const_offset_reg.ud % 16 == 0);
fs_reg payload, offset;
if (devinfo->gen >= 9) {
*/
assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
mul->src[1].type == BRW_REGISTER_TYPE_UD);
- mul->src[1].type = (type_is_signed(mul->src[1].type) ?
- BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
+ mul->src[1].type = BRW_REGISTER_TYPE_UW;
mul->src[1].stride *= 2;
} else if (devinfo->gen == 7 && !devinfo->is_haswell &&
return progress;
}
+bool
+fs_visitor::lower_minmax()
+{
+ assert(devinfo->gen < 6);
+
+ bool progress = false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ const fs_builder ibld(this, block, inst);
+
+ if (inst->opcode == BRW_OPCODE_SEL &&
+ inst->predicate == BRW_PREDICATE_NONE) {
+ /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+ * the original SEL.L/GE instruction
+ */
+ ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+ inst->conditional_mod);
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+ progress = true;
+ }
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
fs_reg *dst, fs_reg color, unsigned components)
const fs_reg &coordinate,
const fs_reg &shadow_c,
const fs_reg &lod, const fs_reg &lod2,
+ const fs_reg &surface,
const fs_reg &sampler,
unsigned coord_components,
unsigned grad_components)
(has_lod || shadow_c.file != BAD_FILE ||
(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
for (unsigned i = coord_components; i < 3; i++)
- bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+ bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
msg_end = offset(msg_end, bld, 3 - coord_components);
}
/* There's no plain shadow compare message, so we use shadow
* compare with a bias of 0.0.
*/
- bld.MOV(msg_end, fs_reg(0.0f));
+ bld.MOV(msg_end, brw_imm_f(0.0f));
msg_end = offset(msg_end, bld, 1);
}
inst->opcode = op;
inst->src[0] = reg_undef;
- inst->src[1] = sampler;
- inst->resize_sources(2);
+ inst->src[1] = surface;
+ inst->src[2] = sampler;
+ inst->resize_sources(3);
inst->base_mrf = msg_begin.nr;
inst->mlen = msg_end.nr - msg_begin.nr;
inst->header_size = 1;
const fs_reg &shadow_c,
fs_reg lod, fs_reg lod2,
const fs_reg &sample_index,
+ const fs_reg &surface,
const fs_reg &sampler,
const fs_reg &offset_value,
unsigned coord_components,
case SHADER_OPCODE_TXF_CMS:
msg_lod = offset(msg_coords, bld, 3);
/* lod */
- bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+ bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
/* sample index */
bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
msg_end = offset(msg_lod, bld, 2);
inst->opcode = op;
inst->src[0] = reg_undef;
- inst->src[1] = sampler;
- inst->resize_sources(2);
+ inst->src[1] = surface;
+ inst->src[2] = sampler;
+ inst->resize_sources(3);
inst->base_mrf = message.nr;
inst->mlen = msg_end.nr - message.nr;
inst->header_size = header_size;
const fs_reg &shadow_c,
fs_reg lod, fs_reg lod2,
const fs_reg &sample_index,
- const fs_reg &mcs, const fs_reg &sampler,
+ const fs_reg &mcs,
+ const fs_reg &surface,
+ const fs_reg &sampler,
fs_reg offset_value,
unsigned coord_components,
unsigned grad_components)
if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
op == SHADER_OPCODE_TEX) {
op = SHADER_OPCODE_TXL;
- lod = fs_reg(0.0f);
+ lod = brw_imm_f(0.0f);
}
/* Set up the LOD info */
/* Generate the SEND. */
inst->opcode = op;
inst->src[0] = src_payload;
- inst->src[1] = sampler;
- inst->resize_sources(2);
+ inst->src[1] = surface;
+ inst->src[2] = sampler;
+ inst->resize_sources(3);
inst->base_mrf = -1;
inst->mlen = mlen;
inst->header_size = header_size;
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
const brw_device_info *devinfo = bld.shader->devinfo;
- const fs_reg &coordinate = inst->src[0];
- const fs_reg &shadow_c = inst->src[1];
- const fs_reg &lod = inst->src[2];
- const fs_reg &lod2 = inst->src[3];
- const fs_reg &sample_index = inst->src[4];
- const fs_reg &mcs = inst->src[5];
- const fs_reg &sampler = inst->src[6];
- const fs_reg &offset_value = inst->src[7];
- assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
- const unsigned coord_components = inst->src[8].ud;
- const unsigned grad_components = inst->src[9].ud;
+ const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
+ const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
+ const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
+ const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
+ const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
+ const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
+ const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
+ const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
+ const fs_reg &offset_value = inst->src[TEX_LOGICAL_SRC_OFFSET_VALUE];
+ assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
+ const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+ assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+ const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
if (devinfo->gen >= 7) {
lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
shadow_c, lod, lod2, sample_index,
- mcs, sampler, offset_value,
+ mcs, surface, sampler, offset_value,
coord_components, grad_components);
} else if (devinfo->gen >= 5) {
lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
shadow_c, lod, lod2, sample_index,
- sampler, offset_value,
+ surface, sampler, offset_value,
coord_components, grad_components);
} else {
lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
- shadow_c, lod, lod2, sampler,
+ shadow_c, lod, lod2,
+ surface, sampler,
coord_components, grad_components);
}
}
{
fs_builder ubld = bld.exec_all().group(8, 0);
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
- ubld.MOV(dst, fs_reg(0));
+ ubld.MOV(dst, brw_imm_d(0));
ubld.MOV(component(dst, 7), sample_mask);
return dst;
}
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
lower_surface_logical_send(ibld, inst,
SHADER_OPCODE_TYPED_SURFACE_READ,
- fs_reg(0xffff));
+ brw_imm_d(0xffff));
break;
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
/* gather4_po_c is unsupported in SIMD16 mode. */
- const fs_reg &shadow_c = inst->src[1];
+ const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
}
case SHADER_OPCODE_TXL_LOGICAL:
* Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
* mode because the message exceeds the maximum length of 11.
*/
- const fs_reg &shadow_c = inst->src[1];
+ const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
return 16;
else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
* circumstances it can end up with a message that is too long in SIMD16
* mode.
*/
- const unsigned coord_components = inst->src[8].ud;
+ const unsigned coord_components =
+ inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
/* First three arguments are the sample index and the two arguments for
* the MCS data.
*/
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
return 8;
+ case SHADER_OPCODE_MOV_INDIRECT:
+ /* Prior to Broadwell, we only have 8 address subregisters */
+ return devinfo->gen < 8 ? 8 : inst->exec_size;
+
default:
return inst->exec_size;
}
case IMM:
unreachable("not reached");
}
+ if (inst->dst.stride != 1)
+ fprintf(file, "<%u>", inst->dst.stride);
fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
for (int i = 0; i < inst->sources; i++) {
break;
case UNIFORM:
fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
- if (inst->src[i].reladdr) {
- fprintf(file, "+reladdr");
- } else if (inst->src[i].subreg_offset) {
+ if (inst->src[i].subreg_offset) {
fprintf(file, "+%d.%d", inst->src[i].reg_offset,
inst->src[i].subreg_offset);
}
case IMM:
switch (inst->src[i].type) {
case BRW_REGISTER_TYPE_F:
- fprintf(file, "%ff", inst->src[i].f);
+ fprintf(file, "%-gf", inst->src[i].f);
break;
case BRW_REGISTER_TYPE_W:
case BRW_REGISTER_TYPE_D:
fprintf(file, "|");
if (inst->src[i].file != IMM) {
+ unsigned stride;
+ if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
+ unsigned hstride = inst->src[i].hstride;
+ stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
+ } else {
+ stride = inst->src[i].stride;
+ }
+ if (stride != 1)
+ fprintf(file, "<%u>", stride);
+
fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
}
{
if (end == start ||
end->is_partial_write() ||
- reg.reladdr ||
!reg.equals(end->dst)) {
return NULL;
} else {
}
void
-fs_visitor::setup_payload_gen6()
+fs_visitor::setup_fs_payload_gen6()
{
- bool uses_depth =
- (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+ assert(stage == MESA_SHADER_FRAGMENT);
+ brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
+ brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
unsigned barycentric_interp_modes =
(stage == MESA_SHADER_FRAGMENT) ?
((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
}
/* R27: interpolated depth if uses source depth */
- if (uses_depth) {
+ prog_data->uses_src_depth =
+ (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+ if (prog_data->uses_src_depth) {
payload.source_depth_reg = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
payload.num_regs++;
}
}
+
/* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
- if (uses_depth) {
+ prog_data->uses_src_w =
+ (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+ if (prog_data->uses_src_w) {
payload.source_w_reg = payload.num_regs;
payload.num_regs++;
if (dispatch_width == 16) {
}
}
- if (stage == MESA_SHADER_FRAGMENT) {
- brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
- brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
- prog_data->uses_pos_offset = key->compute_pos_offset;
- /* R31: MSAA position offsets. */
- if (prog_data->uses_pos_offset) {
- payload.sample_pos_reg = payload.num_regs;
- payload.num_regs++;
- }
+ prog_data->uses_pos_offset = key->compute_pos_offset;
+ /* R31: MSAA position offsets. */
+ if (prog_data->uses_pos_offset) {
+ payload.sample_pos_reg = payload.num_regs;
+ payload.num_regs++;
}
/* R32: MSAA input coverage mask */
- if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
+ prog_data->uses_sample_mask =
+ (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+ if (prog_data->uses_sample_mask) {
assert(devinfo->gen >= 7);
payload.sample_mask_in_reg = payload.num_regs;
payload.num_regs++;
bld = fs_builder(this, 64);
assign_constant_locations();
- demote_pull_constants();
+ lower_constant_loads();
validate();
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
char filename[64];
- snprintf(filename, 64, "%s%d-%s-00-start",
+ snprintf(filename, 64, "%s%d-%s-00-00-start",
stage_abbrev, dispatch_width, nir->info.name);
backend_shader::dump_instructions(filename);
OPT(opt_combine_constants);
OPT(lower_integer_multiplication);
+ if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+ OPT(opt_cmod_propagation);
+ OPT(opt_cse);
+ OPT(opt_copy_propagate);
+ OPT(dead_code_eliminate);
+ }
+
lower_uniform_pull_constant_loads();
validate();
void
fs_visitor::fixup_3src_null_dest()
{
+ bool progress = false;
+
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
if (inst->is_3src() && inst->dst.is_null()) {
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
inst->dst.type);
+ progress = true;
}
}
+
+ if (progress)
+ invalidate_live_intervals();
}
void
* SIMD8. There's probably actually some intermediate point where
* SIMD16 with a couple of spills is still better.
*/
- if (dispatch_width == 16) {
+ if (dispatch_width == 16 && min_dispatch_width <= 8) {
fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else {
return !failed;
}
+bool
+fs_visitor::run_tes()
+{
+ assert(stage == MESA_SHADER_TESS_EVAL);
+
+ /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
+ payload.num_regs = 5;
+
+ if (shader_time_index >= 0)
+ emit_shader_time_begin();
+
+ emit_nir_code();
+
+ if (failed)
+ return false;
+
+ emit_urb_writes();
+
+ if (shader_time_index >= 0)
+ emit_shader_time_end();
+
+ calculate_cfg();
+
+ optimize();
+
+ assign_curb_setup();
+ assign_tes_urb_setup();
+
+ fixup_3src_null_dest();
+ allocate_registers();
+
+ return !failed;
+}
+
bool
fs_visitor::run_gs()
{
*/
if (gs_compile->control_data_header_size_bits <= 32) {
const fs_builder abld = bld.annotate("initialize control data bits");
- abld.MOV(this->control_data_bits, fs_reg(0u));
+ abld.MOV(this->control_data_bits, brw_imm_ud(0u));
}
}
assert(stage == MESA_SHADER_FRAGMENT);
if (devinfo->gen >= 6)
- setup_payload_gen6();
+ setup_fs_payload_gen6();
else
- setup_payload_gen4();
+ setup_fs_payload_gen4();
if (0) {
emit_dummy_fs();
if (shader_time_index >= 0)
emit_shader_time_begin();
+ if (devinfo->is_haswell && prog_data->total_shared > 0) {
+ /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+ const fs_builder abld = bld.exec_all().group(1, 0);
+ abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW),
+ suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+ }
+
emit_nir_code();
if (failed)
return barycentric_interp_modes;
}
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+ bool shade_model_flat, const nir_shader *shader)
+{
+ prog_data->flat_inputs = 0;
+
+ nir_foreach_variable(var, &shader->inputs) {
+ enum glsl_interp_qualifier interp_qualifier =
+ (enum glsl_interp_qualifier)var->data.interpolation;
+ bool is_gl_Color = (var->data.location == VARYING_SLOT_COL0) ||
+ (var->data.location == VARYING_SLOT_COL1);
+
+ int input_index = prog_data->urb_setup[var->data.location];
+
+ if (input_index < 0)
+ continue;
+
+ /* flat shading */
+ if (interp_qualifier == INTERP_QUALIFIER_FLAT ||
+ (shade_model_flat && is_gl_Color &&
+ interp_qualifier == INTERP_QUALIFIER_NONE))
+ prog_data->flat_inputs |= (1 << input_index);
+ }
+}
+
static uint8_t
computed_depth_mode(const nir_shader *shader)
{
void *mem_ctx,
const struct brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
- const nir_shader *shader,
+ const nir_shader *src_shader,
struct gl_program *prog,
int shader_time_index8, int shader_time_index16,
bool use_rep_send,
unsigned *final_assembly_size,
char **error_str)
{
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+ shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+ true);
+ brw_nir_lower_fs_inputs(shader);
+ brw_nir_lower_fs_outputs(shader);
+ shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+
/* key->alpha_test_func means simulating alpha testing via discards,
* so the shader definitely kills pixels.
*/
}
}
+ /* We have to compute the flat inputs after the visitor is finished running
+ * because it relies on prog_data->urb_setup which is computed in
+ * fs_visitor::calculate_urb_setup().
+ */
+ brw_compute_flat_inputs(prog_data, key->flat_shade, shader);
+
cfg_t *simd8_cfg;
int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || use_rep_send;
if ((no_simd8 || compiler->devinfo->gen < 5) && simd16_cfg) {
}
fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
- v.promoted_constants, v.runtime_check_aads_emit, "FS");
+ v.promoted_constants, v.runtime_check_aads_emit,
+ MESA_SHADER_FRAGMENT);
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
return g.get_assembly(final_assembly_size);
}
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
- void *buffer, uint32_t threads, uint32_t stride)
-{
- if (prog_data->local_invocation_id_regs == 0)
- return;
-
- /* 'stride' should be an integer number of registers, that is, a multiple
- * of 32 bytes.
- */
- assert(stride % 32 == 0);
-
- unsigned x = 0, y = 0, z = 0;
- for (unsigned t = 0; t < threads; t++) {
- uint32_t *param = (uint32_t *) buffer + stride * t / 4;
-
- for (unsigned i = 0; i < prog_data->simd_size; i++) {
- param[0 * prog_data->simd_size + i] = x;
- param[1 * prog_data->simd_size + i] = y;
- param[2 * prog_data->simd_size + i] = z;
-
- x++;
- if (x == prog_data->local_size[0]) {
- x = 0;
- y++;
- if (y == prog_data->local_size[1]) {
- y = 0;
- z++;
- if (z == prog_data->local_size[2])
- z = 0;
- }
- }
- }
- }
-}
-
fs_reg *
fs_visitor::emit_cs_local_invocation_id_setup()
{
void *mem_ctx,
const struct brw_cs_prog_key *key,
struct brw_cs_prog_data *prog_data,
- const nir_shader *shader,
+ const nir_shader *src_shader,
int shader_time_index,
unsigned *final_assembly_size,
char **error_str)
{
+ nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+ shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
+ true);
+ brw_nir_lower_cs_shared(shader);
+ prog_data->base.total_shared += shader->num_shared;
+ shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+
prog_data->local_size[0] = shader->info.cs.local_size[0];
prog_data->local_size[1] = shader->info.cs.local_size[1];
prog_data->local_size[2] = shader->info.cs.local_size[2];
shader->info.cs.local_size[2];
unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+ unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
NULL, /* Never used in core profile */
shader, 8, shader_time_index);
- if (!v8.run_cs()) {
- fail_msg = v8.fail_msg;
- } else if (local_workgroup_size <= 8 * max_cs_threads) {
- cfg = v8.cfg;
- prog_data->simd_size = 8;
+ if (simd_required <= 8) {
+ if (!v8.run_cs()) {
+ fail_msg = v8.fail_msg;
+ } else {
+ cfg = v8.cfg;
+ prog_data->simd_size = 8;
+ }
}
fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
!fail_msg && !v8.simd16_unsupported &&
local_workgroup_size <= 16 * max_cs_threads) {
/* Try a SIMD16 compile */
- v16.import_uniforms(&v8);
+ if (simd_required <= 8)
+ v16.import_uniforms(&v8);
if (!v16.run_cs()) {
compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
}
fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
- v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
+ v8.promoted_constants, v8.runtime_check_aads_emit,
+ MESA_SHADER_COMPUTE);
if (INTEL_DEBUG & DEBUG_CS) {
char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
shader->info.label ? shader->info.label :
return g.get_assembly(final_assembly_size);
}
+
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+ void *buffer, uint32_t threads, uint32_t stride)
+{
+ if (prog_data->local_invocation_id_regs == 0)
+ return;
+
+ /* 'stride' should be an integer number of registers, that is, a multiple
+ * of 32 bytes.
+ */
+ assert(stride % 32 == 0);
+
+ unsigned x = 0, y = 0, z = 0;
+ for (unsigned t = 0; t < threads; t++) {
+ uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+ for (unsigned i = 0; i < prog_data->simd_size; i++) {
+ param[0 * prog_data->simd_size + i] = x;
+ param[1 * prog_data->simd_size + i] = y;
+ param[2 * prog_data->simd_size + i] = z;
+
+ x++;
+ if (x == prog_data->local_size[0]) {
+ x = 0;
+ y++;
+ if (y == prog_data->local_size[1]) {
+ y = 0;
+ z++;
+ if (z == prog_data->local_size[2])
+ z = 0;
+ }
+ }
+ }
+ }
+}