#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
-#include "common/gen_debug.h"
+#include "dev/gen_debug.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "program/prog_parameter.h"
}
extern "C" int
-type_size_scalar(const struct glsl_type *type)
+type_size_scalar(const struct glsl_type *type, bool bindless)
{
unsigned int size, i;
case GLSL_TYPE_INT64:
return type->components() * 2;
case GLSL_TYPE_ARRAY:
- return type_size_scalar(type->fields.array) * type->length;
+ return type_size_scalar(type->fields.array, bindless) * type->length;
case GLSL_TYPE_STRUCT:
+ case GLSL_TYPE_INTERFACE:
size = 0;
for (i = 0; i < type->length; i++) {
- size += type_size_scalar(type->fields.structure[i].type);
+ size += type_size_scalar(type->fields.structure[i].type, bindless);
}
return size;
case GLSL_TYPE_SAMPLER:
- case GLSL_TYPE_ATOMIC_UINT:
case GLSL_TYPE_IMAGE:
+ /* Bindless samplers and images are sized at two slots per component. */
+ if (bindless)
+ return type->components() * 2;
+ /* fallthrough: non-bindless samplers/images are sized like atomics. */
+ case GLSL_TYPE_ATOMIC_UINT:
/* Samplers, atomics, and images take up no register space, since
* they're baked in at link time.
*/
return 0;
case GLSL_TYPE_VOID:
case GLSL_TYPE_ERROR:
- case GLSL_TYPE_INTERFACE:
case GLSL_TYPE_FUNCTION:
unreachable("not reached");
}
* it.
*/
bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
{
+ /* The write is reported as partial when any of these holds:
+  *  - the instruction is predicated and is not a SEL,
+  *  - the destination region is not contiguous,
+  *  - exec_size * type_sz(dst.type) is below REG_SIZE,
+  *  - the destination offset is not a multiple of REG_SIZE.
+  */
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
- (this->exec_size * type_sz(this->dst.type)) < 32 ||
!this->dst.is_contiguous() ||
+ (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
this->dst.offset % REG_SIZE != 0);
}
+/**
+ * Reports whether this instruction may leave part of a variable unwritten
+ * for the given dispatch width.
+ *
+ * The variable is taken to span MIN2(REG_SIZE, dispatch_width * type size),
+ * so the result can only differ from is_partial_reg_write() when that
+ * product is smaller than REG_SIZE, i.e. SIMD8 dispatches of 16-bit (or
+ * smaller) instructions.
+ *
+ * \param dispatch_width  dispatch width used to size the variable.
+ * \return true if the write is considered partial, false otherwise.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+   const uint32_t elem_sz = type_sz(this->dst.type);
+   const uint32_t var_sz = MIN2(REG_SIZE, dispatch_width * elem_sz);
+
+   /* A predicated instruction other than SEL always counts as partial. */
+   if (this->predicate && this->opcode != BRW_OPCODE_SEL)
+      return true;
+
+   if (!this->dst.is_contiguous())
+      return true;
+
+   /* Fewer units written than the variable spans. */
+   if (this->exec_size * elem_sz < var_sz)
+      return true;
+
+   /* Otherwise only a misaligned destination makes the write partial. */
+   return this->dst.offset % var_sz != 0;
+}
+
unsigned
fs_inst::components_read(unsigned i) const
{
fs_visitor::vgrf(const glsl_type *const type)
{
int reg_width = dispatch_width / 8;
- return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
+ return fs_reg(VGRF,
+ alloc.allocate(type_size_scalar(type, false) * reg_width),
brw_type_for_base_type(type));
}
break;
}
- /* a * 0.0 = 0.0 */
- if (inst->src[1].is_zero()) {
- inst->opcode = BRW_OPCODE_MOV;
- inst->src[0] = inst->src[1];
- inst->src[1] = reg_undef;
- progress = true;
- break;
- }
-
if (inst->src[0].file == IMM) {
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
if (inst->src[1].file != IMM)
continue;
- /* a + 0.0 = a */
- if (inst->src[1].is_zero()) {
- inst->opcode = BRW_OPCODE_MOV;
- inst->src[1] = reg_undef;
- progress = true;
- break;
- }
-
if (inst->src[0].file == IMM) {
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
inst->opcode = BRW_OPCODE_MOV;
case BRW_OPCODE_OR:
if (inst->src[0].equals(inst->src[1]) ||
inst->src[1].is_zero()) {
- inst->opcode = BRW_OPCODE_MOV;
- inst->src[1] = reg_undef;
- progress = true;
- break;
- }
- break;
- case BRW_OPCODE_LRP:
- if (inst->src[1].equals(inst->src[2])) {
- inst->opcode = BRW_OPCODE_MOV;
- inst->src[0] = inst->src[1];
+ /* On Gen8+, the OR instruction can have a source modifier that
+ * performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
+ * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
+ */
+ if (inst->src[0].negate) {
+ inst->opcode = BRW_OPCODE_NOT;
+ inst->src[0].negate = false;
+ } else {
+ inst->opcode = BRW_OPCODE_MOV;
+ }
inst->src[1] = reg_undef;
- inst->src[2] = reg_undef;
progress = true;
break;
}
}
break;
case BRW_OPCODE_MAD:
- if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
- inst->opcode = BRW_OPCODE_MOV;
- inst->src[1] = reg_undef;
- inst->src[2] = reg_undef;
- progress = true;
- } else if (inst->src[0].is_zero()) {
- inst->opcode = BRW_OPCODE_MUL;
- inst->src[0] = inst->src[2];
- inst->src[2] = reg_undef;
- progress = true;
- } else if (inst->src[1].is_one()) {
+ if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
+ inst->src[1].type != BRW_REGISTER_TYPE_F ||
+ inst->src[2].type != BRW_REGISTER_TYPE_F)
+ break;
+ if (inst->src[1].is_one()) {
inst->opcode = BRW_OPCODE_ADD;
inst->src[1] = inst->src[2];
inst->src[2] = reg_undef;
inst->opcode = BRW_OPCODE_ADD;
inst->src[2] = reg_undef;
progress = true;
- } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
- inst->opcode = BRW_OPCODE_ADD;
- inst->src[1].f *= inst->src[2].f;
- inst->src[2] = reg_undef;
- progress = true;
}
break;
case SHADER_OPCODE_BROADCAST:
if (depth == 0 &&
inst->dst.file == VGRF &&
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
- !inst->is_partial_write()) {
+ !inst->is_partial_reg_write()) {
if (remap[dst] == ~0u) {
remap[dst] = dst;
} else {
if (csel_inst != NULL) {
progress = true;
+ csel_inst->saturate = inst->saturate;
inst->remove(block);
}
next_ip++;
if (inst->opcode != BRW_OPCODE_MOV ||
- inst->is_partial_write() ||
+ inst->is_partial_reg_write() ||
inst->dst.file != MRF || inst->src[0].file != VGRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].abs || inst->src[0].negate ||
* that writes that reg, but it would require smarter
* tracking.
*/
- if (scan_inst->is_partial_write())
+ if (scan_inst->is_partial_reg_write())
break;
/* Handling things not fully contained in the source of the copy
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF &&
inst->src[0].file != ARF &&
- !inst->is_partial_write()) {
+ !inst->is_partial_reg_write()) {
last_mrf_move[inst->dst.nr] = inst;
}
}
/* Set "Source0 Alpha Present to RenderTarget" bit in message
* header.
*/
- if (inst->target > 0 && key->replicate_alpha)
+ if (inst->target > 0 && prog_data->replicate_alpha)
g00_bits |= 1 << 11;
/* Set computes stencil to render target */
length++;
}
+ if (src0_alpha.file != BAD_FILE) {
+ for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
+ const fs_builder &ubld = bld.exec_all().group(8, i)
+ .annotate("FB write src0 alpha");
+ const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
+ ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
+ setup_color_payload(ubld, key, &sources[length], tmp, 1);
+ length++;
+ }
+ } else if (prog_data->replicate_alpha && inst->target != 0) {
+ /* Handle the case when fragment shader doesn't write to draw buffer
+ * zero. No need to call setup_color_payload() for src0_alpha because
+ * alpha value will be undefined.
+ */
+ length += bld.dispatch_width() / 8;
+ }
+
if (sample_mask.file != BAD_FILE) {
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
BRW_REGISTER_TYPE_UD);
payload_header_size = length;
- if (src0_alpha.file != BAD_FILE) {
- /* FIXME: This is being passed at the wrong location in the payload and
- * doesn't work when gl_SampleMask and MRTs are used simultaneously.
- * It's supposed to be immediately before oMask but there seems to be no
- * reasonable way to pass them in the correct order because LOAD_PAYLOAD
- * requires header sources to form a contiguous segment at the beginning
- * of the message and src0_alpha has per-channel semantics.
- */
- setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
- length++;
- } else if (key->replicate_alpha && inst->target != 0) {
- /* Handle the case when fragment shader doesn't write to draw buffer
- * zero. No need to call setup_color_payload() for src0_alpha because
- * alpha value will be undefined.
- */
- length++;
- }
-
setup_color_payload(bld, key, &sources[length], color0, components);
length += 4;
return progress;
}
+/**
+ * Returns true when \p inst is treated as a mixed-float instruction with a
+ * 32-bit float destination: either an F16TO32, or an instruction whose
+ * destination type is :F and that has at least one :HF source.
+ */
+static bool
+is_mixed_float_with_fp32_dst(const fs_inst *inst)
+{
+   /* F16TO32 is accepted regardless of operand types: its source may be
+    * typed :W even when it holds :HF data, because gen7 has no :HF support.
+    */
+   if (inst->opcode == BRW_OPCODE_F16TO32)
+      return true;
+
+   bool has_hf_src = false;
+
+   /* Sources only matter when the destination is :F. */
+   if (inst->dst.type == BRW_REGISTER_TYPE_F) {
+      for (int s = 0; s < inst->sources && !has_hf_src; s++)
+         has_hf_src = inst->src[s].type == BRW_REGISTER_TYPE_HF;
+   }
+
+   return has_hf_src;
+}
+
+/**
+ * Returns true when \p inst is treated as a mixed-float instruction with a
+ * packed (stride == 1) half-float destination: either an F32TO16, or an
+ * instruction whose destination type is :HF and that has at least one :F
+ * source.
+ */
+static bool
+is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
+{
+   /* Every accepted case requires a destination stride of 1. */
+   if (inst->dst.stride != 1)
+      return false;
+
+   /* F32TO16 is accepted regardless of the destination type: it may be
+    * typed :W even when the destination is a :HF, because gen7 has no :HF
+    * support.
+    */
+   if (inst->opcode == BRW_OPCODE_F32TO16)
+      return true;
+
+   if (inst->dst.type != BRW_REGISTER_TYPE_HF)
+      return false;
+
+   /* :HF destination: mixed mode needs at least one :F source. */
+   for (int s = 0; s < inst->sources; s++) {
+      if (inst->src[s].type == BRW_REGISTER_TYPE_F)
+         return true;
+   }
+
+   return false;
+}
+
/**
* Get the closest allowed SIMD width for instruction \p inst accounting for
* some common regioning and execution control restrictions that apply to FPU
max_width = MIN2(max_width, 4);
}
+ /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+ * Float Operations:
+ *
+ * "No SIMD16 in mixed mode when destination is f32. Instruction
+ * execution size must be no more than 8."
+ *
+ * FIXME: the simulator doesn't seem to complain if we don't do this, and
+ * empirical testing with existing CTS tests shows that they pass just fine
+ * without implementing this. However, since our interpretation of the PRM
+ * is that conversion MOVs between HF and F are still mixed-float
+ * instructions (and therefore subject to this restriction), we decided to
+ * split them to be safe. It might be useful to investigate further and
+ * lift the restriction if we can ensure that it is safe, because these
+ * conversions are common when half-float types are involved: many
+ * instructions do not support HF types, so conversions from/to F are
+ * required.
+ */
+ if (is_mixed_float_with_fp32_dst(inst))
+ max_width = MIN2(max_width, 8);
+
+ /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+ * Float Operations:
+ *
+ * "No SIMD16 in mixed mode when destination is packed f16 for both
+ * Align1 and Align16."
+ */
+ if (is_mixed_float_with_packed_fp16_dst(inst))
+ max_width = MIN2(max_width, 8);
+
/* Only power-of-two execution sizes are representable in the instruction
* control fields.
*/
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
- case SHADER_OPCODE_COS:
+ case SHADER_OPCODE_COS: {
/* Unary extended math instructions are limited to SIMD8 on Gen4 and
- * Gen6.
+ * Gen6. Extended Math Function is limited to SIMD8 with half-float.
*/
- return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
- devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
- MIN2(8, inst->exec_size));
+ if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x))
+ return MIN2(8, inst->exec_size);
+ if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ return MIN2(8, inst->exec_size);
+ return MIN2(16, inst->exec_size);
+ }
- case SHADER_OPCODE_POW:
- /* SIMD16 is only allowed on Gen7+. */
- return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
- MIN2(8, inst->exec_size));
+ case SHADER_OPCODE_POW: {
+ /* SIMD16 is only allowed on Gen7+. Extended Math Function is limited
+ * to SIMD8 with half-float
+ */
+ if (devinfo->gen < 7)
+ return MIN2(8, inst->exec_size);
+ if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+ return MIN2(8, inst->exec_size);
+ return MIN2(16, inst->exec_size);
+ }
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
brw_nir_lower_fs_inputs(shader, devinfo, key);
brw_nir_lower_fs_outputs(shader);
- if (devinfo->gen < 6) {
- brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
- }
+ if (devinfo->gen < 6)
+ brw_setup_vue_interpolation(vue_map, shader, prog_data);
if (!key->multisample_fbo)
NIR_PASS_V(shader, demote_sample_qualifiers);
{
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
- brw_nir_lower_cs_intrinsics(shader, dispatch_width);
+
+ NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width);
+
+ /* Clean up after the local index and ID calculations. */
+ NIR_PASS_V(shader, nir_opt_constant_folding);
+ NIR_PASS_V(shader, nir_opt_dce);
+
return brw_postprocess_nir(shader, compiler, true);
}