--- /dev/null
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+namespace {
+   /* From the SKL PRM Vol 2a, "Move":
+    *
+    * "A mov with the same source and destination type, no source modifier,
+    * and no saturation is a raw move. A packed byte destination region (B
+    * or UB type with HorzStride == 1 and ExecSize > 1) can only be written
+    * using raw move."
+    */
+   /* Return whether \p inst is such a raw move with a byte-sized
+    * destination type.
+    */
+   bool
+   is_byte_raw_mov(const fs_inst *inst)
+   {
+      return type_sz(inst->dst.type) == 1 &&
+             inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].type == inst->dst.type &&
+             !inst->saturate &&
+             !inst->src[0].negate &&
+             !inst->src[0].abs;
+   }
+
+   /*
+    * Return an acceptable byte stride for the destination of an instruction
+    * that requires it to have some particular alignment.
+    */
+   unsigned
+   required_dst_byte_stride(const fs_inst *inst)
+   {
+      /* A destination narrower than the execution type (other than a raw
+       * byte MOV, which is exempt per the PRM quote above) must be strided
+       * out to the execution type size.
+       */
+      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+          !is_byte_raw_mov(inst)) {
+         return get_exec_type_size(inst);
+      } else {
+         /* Otherwise require a byte stride at least as large as that of
+          * every non-uniform source, starting from the current destination
+          * stride.
+          */
+         unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (!is_uniform(inst->src[i]))
+               stride = MAX2(stride, inst->src[i].stride *
+                             type_sz(inst->src[i].type));
+         }
+
+         return stride;
+      }
+   }
+
+   /*
+    * Return an acceptable byte sub-register offset for the destination of an
+    * instruction that requires it to be aligned to the sub-register offset of
+    * the sources.
+    */
+   unsigned
+   required_dst_byte_offset(const fs_inst *inst)
+   {
+      /* If any non-uniform source disagrees with the destination about the
+       * sub-register byte offset there is no single offset matching all of
+       * them, so fall back to offset zero.
+       */
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (!is_uniform(inst->src[i]))
+            if (reg_offset(inst->src[i]) % REG_SIZE !=
+                reg_offset(inst->dst) % REG_SIZE)
+               return 0;
+      }
+
+      /* All sources agree (or are uniform), so the current destination
+       * offset is acceptable as-is.
+       */
+      return reg_offset(inst->dst) % REG_SIZE;
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
+                          unsigned i)
+   {
+      /* Unordered instructions are exempt from the aligned-region
+       * restriction checked below.
+       */
+      if (is_unordered(inst)) {
+         return false;
+      } else {
+         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
+         const unsigned src_byte_stride = inst->src[i].stride *
+            type_sz(inst->src[i].type);
+         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+         const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
+
+         /* The region is invalid when the hardware requires source regions
+          * aligned to the destination and this (non-uniform) source differs
+          * from the destination in byte stride or sub-register byte offset.
+          */
+         return has_dst_aligned_region_restriction(devinfo, inst) &&
+                !is_uniform(inst->src[i]) &&
+                (src_byte_stride != dst_byte_stride ||
+                 src_byte_offset != dst_byte_offset);
+      }
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the destination region.
+    */
+   bool
+   has_invalid_dst_region(const gen_device_info *devinfo,
+                          const fs_inst *inst)
+   {
+      /* Unordered instructions are exempt from the aligned-region
+       * restriction checked below.
+       */
+      if (is_unordered(inst)) {
+         return false;
+      } else {
+         const brw_reg_type exec_type = get_exec_type(inst);
+         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
+         /* Conversions to a type narrower than the execution type are only
+          * exempt when they qualify as raw byte MOVs (see is_byte_raw_mov).
+          */
+         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
+                                              type_sz(inst->dst.type) < type_sz(exec_type);
+
+         /* Invalid either because the hardware restriction applies and the
+          * destination stride/offset doesn't match the required values, or
+          * because a narrowing conversion needs a strided destination.
+          */
+         return (has_dst_aligned_region_restriction(devinfo, inst) &&
+                 (required_dst_byte_stride(inst) != dst_byte_stride ||
+                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
+                (is_narrowing_conversion &&
+                 required_dst_byte_stride(inst) != dst_byte_stride);
+      }
+   }
+
+   /*
+    * Return whether the instruction has unsupported source modifiers
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
+                             unsigned i)
+   {
+      /* Negate/abs are only a problem on instructions that cannot take
+       * source modifiers on this hardware generation.
+       */
+      return !inst->can_do_source_mods(devinfo) &&
+             (inst->src[i].negate || inst->src[i].abs);
+   }
+
+   /*
+    * Return whether the instruction has an unsupported type conversion
+    * specified for the destination.
+    */
+   bool
+   has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
+   {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         return false;
+      case BRW_OPCODE_SEL:
+         /* SEL cannot convert: its destination type must match the
+          * execution type.
+          */
+         return inst->dst.type != get_exec_type(inst);
+      case SHADER_OPCODE_BROADCAST:
+      case SHADER_OPCODE_MOV_INDIRECT:
+         /* The source and destination types of these may be hard-coded to
+          * integer at codegen time due to hardware limitations of 64-bit
+          * types.
+          */
+         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+                 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
+                type_sz(inst->src[0].type) > 4 &&
+                inst->dst.type != inst->src[0].type;
+      default:
+         /* FIXME: We assume the opcodes not explicitly mentioned above just
+          * work fine with arbitrary conversions.
+          */
+         return false;
+      }
+   }
+
+   /**
+    * Return whether the instruction has non-standard semantics for the
+    * conditional mod which don't cause the flag register to be updated with
+    * the comparison result.
+    */
+   bool
+   has_inconsistent_cmod(const fs_inst *inst)
+   {
+      /* For these opcodes the conditional mod is an input that controls the
+       * operation rather than an output written to the flag register, so it
+       * must not be copied to or cleared from a replacement instruction.
+       */
+      return inst->opcode == BRW_OPCODE_SEL ||
+             inst->opcode == BRW_OPCODE_CSEL ||
+             inst->opcode == BRW_OPCODE_IF ||
+             inst->opcode == BRW_OPCODE_WHILE;
+   }
+
+ bool
+ lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
+}
+
+namespace brw {
+   /**
+    * Remove any modifiers from the \p i-th source region of the instruction,
+    * including negate, abs and any implicit type conversion to the execution
+    * type.  Instead any source modifiers will be implemented as a separate
+    * MOV instruction prior to the original instruction.
+    */
+   bool
+   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      const fs_builder ibld(v, block, inst);
+      /* The temporary uses the execution type, so the MOV below also folds
+       * in any implicit conversion from the source type.
+       */
+      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
+
+      /* Recursively legalize the new MOV in case it violates any regioning
+       * restriction itself.
+       */
+      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
+      inst->src[i] = tmp;
+
+      return true;
+   }
+}
+
+namespace {
+   /**
+    * Remove any modifiers from the destination region of the instruction,
+    * including saturate, conditional mod and any implicit type conversion
+    * from the execution type.  Instead any destination modifiers will be
+    * implemented as a separate MOV instruction after the original
+    * instruction.
+    */
+   bool
+   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const fs_builder ibld(v, block, inst);
+      const brw_reg_type type = get_exec_type(inst);
+      /* Not strictly necessary, but if possible use a temporary with the same
+       * channel alignment as the current destination in order to avoid
+       * violating the restrictions enforced later on by lower_src_region()
+       * and lower_dst_region(), which would introduce additional copy
+       * instructions into the program unnecessarily.
+       */
+      const unsigned stride =
+         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
+         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
+      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);
+
+      /* Emit a MOV taking care of all the destination modifiers. */
+      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
+      mov->saturate = inst->saturate;
+      /* Only copy the cmod when it actually writes the flag register (see
+       * has_inconsistent_cmod) -- for SEL/CSEL/IF/WHILE it's an input.
+       */
+      if (!has_inconsistent_cmod(inst))
+         mov->conditional_mod = inst->conditional_mod;
+      /* NOTE: the predicate of a SEL selects between its sources rather
+       * than masking the write, so it stays on the original instruction.
+       */
+      if (inst->opcode != BRW_OPCODE_SEL) {
+         mov->predicate = inst->predicate;
+         mov->predicate_inverse = inst->predicate_inverse;
+      }
+      mov->flag_subreg = inst->flag_subreg;
+      /* Recursively legalize the new MOV in case it violates any regioning
+       * restriction itself.
+       */
+      lower_instruction(v, block, mov);
+
+      /* Point the original instruction at the temporary, and clean up any
+       * destination modifiers.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+      inst->saturate = false;
+      if (!has_inconsistent_cmod(inst))
+         inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+      /* A predicated copy-back would read stale flag contents if the
+       * original instruction also wrote the flags.
+       */
+      assert(!inst->flags_written() || !mov->predicate);
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the \p i-th source region
+    * of the instruction.  Instead implement the region as a series of integer
+    * copies into a temporary with the same channel layout as the destination.
+    */
+   bool
+   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      const fs_builder ibld(v, block, inst);
+      /* Give the temporary the same byte stride as the destination so the
+       * source region of the original instruction becomes trivial.
+       */
+      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
+                              type_sz(inst->src[i].type);
+      assert(stride > 0);
+      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
+                                      stride);
+
+      /* Emit a series of 32-bit integer copies with any source modifiers
+       * cleaned up (because their semantics are dependent on the type).
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+      fs_reg raw_src = inst->src[i];
+      raw_src.negate = false;
+      raw_src.abs = false;
+
+      /* Copy each 32-bit (or smaller) chunk of the wide type separately. */
+      for (unsigned j = 0; j < n; j++)
+         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any source modifiers in the instruction.
+       */
+      fs_reg lower_src = tmp;
+      lower_src.negate = inst->src[i].negate;
+      lower_src.abs = inst->src[i].abs;
+      inst->src[i] = lower_src;
+
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the destination region of
+    * the instruction.  Instead implement the region as a series of integer
+    * copies from a temporary with a channel layout compatible with the
+    * sources.
+    */
+   bool
+   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const fs_builder ibld(v, block, inst);
+      /* Give the temporary the stride the hardware requires for this
+       * instruction (see required_dst_byte_stride).
+       */
+      const unsigned stride = required_dst_byte_stride(inst) /
+                              type_sz(inst->dst.type);
+      assert(stride > 0);
+      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
+                                      stride);
+
+      /* Emit a series of 32-bit integer copies from the temporary into the
+       * original destination.
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+
+      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
+         /* Note that in general we cannot simply predicate the copies on the
+          * same flag register as the original instruction, since it may have
+          * been overwritten by the instruction itself.  Instead initialize
+          * the temporary with the previous contents of the destination
+          * register.
+          */
+         for (unsigned j = 0; j < n; j++)
+            ibld.MOV(subscript(tmp, raw_type, j),
+                     subscript(inst->dst, raw_type, j));
+      }
+
+      /* Copy the temporary back into the real destination after the
+       * original instruction executes.
+       */
+      for (unsigned j = 0; j < n; j++)
+         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
+                                        subscript(tmp, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any destination modifiers in the instruction.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+
+      return true;
+   }
+
+   /**
+    * Legalize the source and destination regioning controls of the specified
+    * instruction.  Returns whether the instruction was changed.
+    */
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const gen_device_info *devinfo = v->devinfo;
+      bool progress = false;
+
+      /* Strip unsupported destination conversions first, since that may
+       * change the execution/destination types the later checks depend on.
+       */
+      if (has_invalid_conversion(devinfo, inst))
+         progress |= lower_dst_modifiers(v, block, inst);
+
+      if (has_invalid_dst_region(devinfo, inst))
+         progress |= lower_dst_region(v, block, inst);
+
+      /* Then legalize each source region and its modifiers independently. */
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (has_invalid_src_modifiers(devinfo, inst, i))
+            progress |= lower_src_modifiers(v, block, inst, i);
+
+         if (has_invalid_src_region(devinfo, inst, i))
+            progress |= lower_src_region(v, block, inst, i);
+      }
+
+      return progress;
+   }
+}
+
+/**
+ * Lowering pass: legalize the regioning controls of every instruction in
+ * the program so they satisfy the hardware's region restrictions.  Returns
+ * whether any instruction was changed.
+ */
+bool
+fs_visitor::lower_regioning()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
+      progress |= lower_instruction(this, block, inst);
+
+   /* Lowering inserts new MOVs, so any previously computed liveness
+    * information is stale.
+    */
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}