* - tdr0 thread dependency register
*/
-#include <tuple>
-#include <vector>
-
#include "brw_fs.h"
#include "brw_cfg.h"
*/
typedef int ordered_address;
+ /**
+ * Return the number of instructions in the program.
+ */
+ unsigned
+ num_instructions(const backend_shader *shader)
+ {
+ return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
+ }
+
/**
* Calculate the local ordered_address instruction counter at every
* instruction of the shader for subsequent constant-time look-up.
*/
- std::vector<ordered_address>
+ ordered_address *
ordered_inst_addresses(const fs_visitor *shader)
{
- std::vector<ordered_address> jps;
+ ordered_address *jps = new ordered_address[num_instructions(shader)];
ordered_address jp = 0;
+ unsigned ip = 0;
foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
- jps.push_back(jp);
+ jps[ip] = jp;
jp += ordered_unit(inst);
+ ip++;
}
return jps;
* only if i == j for every pair of unsigned integers i and j.
*/
struct equivalence_relation {
+ equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
+ {
+ for (unsigned i = 0; i < n; i++)
+ is[i] = i;
+ }
+
+ ~equivalence_relation()
+ {
+ delete[] is;
+ }
+
/**
* Return equivalence class index of the specified element. Effectively
* this is the numeric value of an arbitrary representative from the
unsigned
lookup(unsigned i) const
{
- if (i < is.size() && is[i] != i)
+ if (i < n && is[i] != i)
return lookup(is[i]);
else
return i;
* Create an array with the results of the lookup() method for
* constant-time evaluation.
*/
- std::vector<unsigned>
- flatten() const {
- std::vector<unsigned> ids;
+ unsigned *
+ flatten() const
+ {
+ unsigned *ids = new unsigned[n];
- for (const auto i : is)
- ids.push_back(lookup(i));
+ for (unsigned i = 0; i < n; i++)
+ ids[i] = lookup(i);
return ids;
}
}
private:
+ equivalence_relation(const equivalence_relation &);
+
+ equivalence_relation &
+ operator=(const equivalence_relation &);
+
/**
* Assign the representative of \p from to be equivalent to \p to.
*
assign(unsigned from, unsigned to)
{
if (from != to) {
- if (from < is.size() && is[from] != from)
- assign(is[from], to);
+ assert(from < n);
- for (unsigned i = is.size(); i <= from; i++)
- is.push_back(i);
+ if (is[from] != from)
+ assign(is[from], to);
is[from] = to;
}
}
- std::vector<unsigned> is;
+ unsigned *is;
+ unsigned n;
};
/**
* No dependency information.
*/
dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN),
- unordered(TGL_SBID_NULL), id(0) {}
+ unordered(TGL_SBID_NULL), id(0),
+ exec_all(false) {}
/**
* Construct a dependency on the in-order instruction with the provided
* ordered_address instruction counter.
*/
- dependency(tgl_regdist_mode mode, ordered_address jp) :
- ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0) {}
+ dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) :
+ ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
+ exec_all(exec_all) {}
/**
* Construct a dependency on the out-of-order instruction with the
* specified synchronization token.
*/
- dependency(tgl_sbid_mode mode, unsigned id) :
- ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id) {}
+ dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
+ ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id),
+ exec_all(exec_all) {}
/**
* Synchronization mode of in-order dependency, or zero if no in-order
/** Synchronization token of out-of-order dependency. */
unsigned id;
+ /**
+ * Whether the dependency could be run with execution masking disabled,
+ * which might lead to the unwanted execution of the generating
+ * instruction in cases where a BB is executed with all channels
+ * disabled due to hardware bug GEN:BUG:1407528679.
+ */
+ bool exec_all;
+
/**
* Trivial in-order dependency that's always satisfied.
*
return dep0.ordered == dep1.ordered &&
dep0.jp == dep1.jp &&
dep0.unordered == dep1.unordered &&
- dep0.id == dep1.id;
+ dep0.id == dep1.id &&
+ dep0.exec_all == dep1.exec_all;
}
friend bool
}
};
- const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN);
+ const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false);
/**
* Return whether \p dep contains any dependency information.
dep1.unordered ? dep1.id : dep0.id);
}
+ dep.exec_all = dep0.exec_all || dep1.exec_all;
+
return dep;
}
* Dependency list handling.
* @{
*/
+ struct dependency_list {
+ dependency_list() : deps(NULL), n(0) {}
+
+ ~dependency_list()
+ {
+ free(deps);
+ }
+
+ void
+ push_back(const dependency &dep)
+ {
+ deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
+ deps[n++] = dep;
+ }
+
+ unsigned
+ size() const
+ {
+ return n;
+ }
+
+ const dependency &
+ operator[](unsigned i) const
+ {
+ assert(i < n);
+ return deps[i];
+ }
+
+ dependency &
+ operator[](unsigned i)
+ {
+ assert(i < n);
+ return deps[i];
+ }
+
+ private:
+ dependency_list(const dependency_list &);
+ dependency_list &
+ operator=(const dependency_list &);
+
+ dependency *deps;
+ unsigned n;
+ };
/**
* Add dependency \p dep to the list of dependencies of an instruction
* \p deps.
*/
void
- add_dependency(const std::vector<unsigned> &ids,
- std::vector<dependency> &deps, dependency dep)
+ add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
{
if (is_valid(dep)) {
/* Translate the unordered dependency token first in order to keep
* the list minimally redundant.
*/
- if (dep.unordered && dep.id < ids.size())
+ if (dep.unordered)
dep.id = ids[dep.id];
/* Try to combine the specified dependency with any existing ones. */
- for (auto &dep1 : deps) {
- if (dep.ordered && dep1.ordered) {
- dep1.jp = MAX2(dep1.jp, dep.jp);
- dep1.ordered |= dep.ordered;
+ for (unsigned i = 0; i < deps.size(); i++) {
+ /* Don't combine otherwise matching dependencies if there is an
+ * exec_all mismatch which would cause a SET dependency to gain an
+ * exec_all flag, since that would prevent it from being baked
+ * into the instruction we want to allocate an SBID for.
+ */
+ if (deps[i].exec_all != dep.exec_all &&
+ (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
+ (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
+ continue;
+
+ if (dep.ordered && deps[i].ordered) {
+ deps[i].jp = MAX2(deps[i].jp, dep.jp);
+ deps[i].ordered |= dep.ordered;
+ deps[i].exec_all |= dep.exec_all;
dep.ordered = TGL_REGDIST_NULL;
}
- if (dep.unordered && dep1.unordered && dep1.id == dep.id) {
- dep1.unordered |= dep.unordered;
+ if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
+ deps[i].unordered |= dep.unordered;
+ deps[i].exec_all |= dep.exec_all;
dep.unordered = TGL_SBID_NULL;
}
}
/**
* Construct a tgl_swsb annotation encoding any ordered dependencies from
- * the dependency list \p deps of an instruction with ordered_address
- * \p jp.
+ * the dependency list \p deps of an instruction with ordered_address \p
+ * jp. If \p exec_all is false only dependencies known to be executed with
+ * channel masking applied will be considered in the calculation.
*/
tgl_swsb
- ordered_dependency_swsb(const std::vector<dependency> &deps,
- const ordered_address &jp)
+ ordered_dependency_swsb(const dependency_list &deps,
+ const ordered_address &jp,
+ bool exec_all)
{
unsigned min_dist = ~0u;
- for (const auto &dep : deps) {
- if (dep.ordered) {
- const unsigned dist = jp - dep.jp;
+ for (unsigned i = 0; i < deps.size(); i++) {
+ if (deps[i].ordered && exec_all >= deps[i].exec_all) {
+ const unsigned dist = jp - deps[i].jp;
const unsigned max_dist = 10;
- assert(jp > dep.jp);
+ assert(jp > deps[i].jp);
if (dist <= max_dist)
min_dist = MIN3(min_dist, dist, 7);
}
/**
* Return whether the dependency list \p deps of an instruction with
- * ordered_address \p jp has any non-trivial ordered dependencies.
+ * ordered_address \p jp has any non-trivial ordered dependencies. If \p
+ * exec_all is false only dependencies known to be executed with channel
+ * masking applied will be considered in the calculation.
*/
bool
- find_ordered_dependency(const std::vector<dependency> &deps,
- const ordered_address &jp)
+ find_ordered_dependency(const dependency_list &deps,
+ const ordered_address &jp,
+ bool exec_all)
{
- return ordered_dependency_swsb(deps, jp).regdist;
+ return ordered_dependency_swsb(deps, jp, exec_all).regdist;
}
/**
* Return the full tgl_sbid_mode bitset for the first unordered dependency
* on the list \p deps that matches the specified tgl_sbid_mode, or zero if
- * no such dependency is present.
+ * no such dependency is present. If \p exec_all is false only
+ * dependencies known to be executed with channel masking applied will be
+ * considered in the calculation.
*/
tgl_sbid_mode
- find_unordered_dependency(const std::vector<dependency> &deps,
- tgl_sbid_mode unordered)
+ find_unordered_dependency(const dependency_list &deps,
+ tgl_sbid_mode unordered,
+ bool exec_all)
{
if (unordered) {
- for (const auto &dep : deps) {
- if (unordered & dep.unordered)
- return dep.unordered;
+ for (unsigned i = 0; i < deps.size(); i++) {
+ if ((unordered & deps[i].unordered) &&
+ exec_all >= deps[i].exec_all)
+ return deps[i].unordered;
}
}
*/
tgl_sbid_mode
baked_unordered_dependency_mode(const fs_inst *inst,
- const std::vector<dependency> &deps,
+ const dependency_list &deps,
const ordered_address &jp)
{
- const bool has_ordered = find_ordered_dependency(deps, jp);
+ const bool exec_all = inst->force_writemask_all;
+ const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
- if (find_unordered_dependency(deps, TGL_SBID_SET))
- return find_unordered_dependency(deps, TGL_SBID_SET);
+ if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
+ return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
else if (has_ordered && is_unordered(inst))
return TGL_SBID_NULL;
- else if (find_unordered_dependency(deps, TGL_SBID_DST) &&
+ else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
(!has_ordered || !is_unordered(inst)))
- return find_unordered_dependency(deps, TGL_SBID_DST);
+ return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
else if (!has_ordered)
- return find_unordered_dependency(deps, TGL_SBID_SRC);
+ return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
else
return TGL_SBID_NULL;
}
* instruction \p inst.
*/
void
- update_inst_scoreboard(const fs_visitor *shader,
- const std::vector<ordered_address> &jps,
+ update_inst_scoreboard(const ordered_address *jps,
const fs_inst *inst, unsigned ip, scoreboard &sb)
{
+ const bool exec_all = inst->force_writemask_all;
+
/* Track any source registers that may be fetched asynchronously by this
* instruction, otherwise clear the dependency in order to avoid
* subsequent redundant synchronization.
*/
for (unsigned i = 0; i < inst->sources; i++) {
const dependency rd_dep =
- inst->is_payload(i) || inst->is_math() ? dependency(TGL_SBID_SRC, ip) :
- ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip]) :
+ (inst->is_payload(i) ||
+ inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
+ ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) :
dependency::done;
for (unsigned j = 0; j < regs_read(inst, i); j++)
}
if (is_send(inst) && inst->base_mrf != -1) {
- const dependency rd_dep = dependency(TGL_SBID_SRC, ip);
+ const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);
for (unsigned j = 0; j < inst->mlen; j++)
sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
/* Track any destination registers of this instruction. */
const dependency wr_dep =
- is_unordered(inst) ? dependency(TGL_SBID_DST, ip) :
- ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip]) :
+ is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
+ ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) :
dependency();
if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
* unconditionally resolved) dependencies at the end of each block of the
* program.
*/
- std::vector<scoreboard>
+ scoreboard *
gather_block_scoreboards(const fs_visitor *shader,
- const std::vector<ordered_address> &jps)
+ const ordered_address *jps)
{
- std::vector<scoreboard> sbs(shader->cfg->num_blocks);
+ scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
unsigned ip = 0;
foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
- update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);
+ update_inst_scoreboard(jps, inst, ip++, sbs[block->num]);
return sbs;
}
* Calculates the set of dependencies potentially pending at the beginning
* of each block, and returns it as an array of scoreboard objects.
*/
- std::pair<std::vector<scoreboard>, std::vector<unsigned>>
+ scoreboard *
propagate_block_scoreboards(const fs_visitor *shader,
- const std::vector<ordered_address> &jps)
+ const ordered_address *jps,
+ equivalence_relation &eq)
{
- const std::vector<scoreboard> delta_sbs =
- gather_block_scoreboards(shader, jps);
- std::vector<scoreboard> in_sbs(shader->cfg->num_blocks);
- std::vector<scoreboard> out_sbs(shader->cfg->num_blocks);
- equivalence_relation eq;
+ const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
+ scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
+ scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];
for (bool progress = true; progress;) {
progress = false;
}
}
- return { std::move(in_sbs), eq.flatten() };
+ delete[] delta_sbs;
+ delete[] out_sbs;
+
+ return in_sbs;
}
/**
* Return the list of potential dependencies of each instruction in the
* shader based on the result of global dependency analysis.
*/
- std::vector<std::vector<dependency>>
+ dependency_list *
gather_inst_dependencies(const fs_visitor *shader,
- const std::vector<ordered_address> &jps)
+ const ordered_address *jps)
{
- std::vector<scoreboard> sbs;
- std::vector<unsigned> ids;
- std::vector<std::vector<dependency>> deps;
+ equivalence_relation eq(num_instructions(shader));
+ scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
+ const unsigned *ids = eq.flatten();
+ dependency_list *deps = new dependency_list[num_instructions(shader)];
unsigned ip = 0;
- std::tie(sbs, ids) = propagate_block_scoreboards(shader, jps);
-
foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
+ const bool exec_all = inst->force_writemask_all;
scoreboard &sb = sbs[block->num];
- std::vector<dependency> inst_deps;
for (unsigned i = 0; i < inst->sources; i++) {
for (unsigned j = 0; j < regs_read(inst, i); j++)
- add_dependency(ids, inst_deps, dependency_for_read(
+ add_dependency(ids, deps[ip], dependency_for_read(
sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
}
if (is_send(inst) && inst->base_mrf != -1) {
for (unsigned j = 0; j < inst->mlen; j++)
- add_dependency(ids, inst_deps, dependency_for_read(
+ add_dependency(ids, deps[ip], dependency_for_read(
sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
}
if (is_unordered(inst))
- add_dependency(ids, inst_deps, dependency(TGL_SBID_SET, ip));
+ add_dependency(ids, deps[ip],
+ dependency(TGL_SBID_SET, ip, exec_all));
if (!inst->no_dd_check) {
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++) {
- add_dependency(ids, inst_deps, dependency_for_write(inst,
+ add_dependency(ids, deps[ip], dependency_for_write(inst,
sb.get(byte_offset(inst->dst, REG_SIZE * j))));
}
}
if (is_send(inst) && inst->base_mrf != -1) {
- for (int j = 0; j < shader->implied_mrf_writes(inst); j++)
- add_dependency(ids, inst_deps, dependency_for_write(inst,
+ for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
+ add_dependency(ids, deps[ip], dependency_for_write(inst,
sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
}
}
- deps.push_back(inst_deps);
- update_inst_scoreboard(shader, jps, inst, ip, sb);
+ update_inst_scoreboard(jps, inst, ip, sb);
ip++;
}
+ delete[] sbs;
+ delete[] ids;
+
return deps;
}
* Allocate SBID tokens to track the execution of every out-of-order
* instruction of the shader.
*/
- std::vector<std::vector<dependency>>
+ dependency_list *
allocate_inst_dependencies(const fs_visitor *shader,
- const std::vector<std::vector<dependency>> &deps0)
+ const dependency_list *deps0)
{
/* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
* shaders with a large number of SEND messages.
*/
- std::vector<std::vector<dependency>> deps1;
- std::vector<unsigned> ids(deps0.size(), ~0u);
+
+ /* Allocate an unordered dependency ID to hardware SBID translation
+ * table with as many entries as instructions there are in the shader,
+ * which is the maximum number of unordered IDs we can find in the
+ * program.
+ */
+ unsigned *ids = new unsigned[num_instructions(shader)];
+ for (unsigned ip = 0; ip < num_instructions(shader); ip++)
+ ids[ip] = ~0u;
+
+ dependency_list *deps1 = new dependency_list[num_instructions(shader)];
unsigned next_id = 0;
- for (const auto &inst_deps0 : deps0) {
- std::vector<dependency> inst_deps1;
+ for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
+ for (unsigned i = 0; i < deps0[ip].size(); i++) {
+ const dependency &dep = deps0[ip][i];
- for (const auto &dep : inst_deps0) {
if (dep.unordered && ids[dep.id] == ~0u)
ids[dep.id] = (next_id++) & 0xf;
- add_dependency(ids, inst_deps1, dep);
+ add_dependency(ids, deps1[ip], dep);
}
-
- deps1.push_back(inst_deps1);
}
+ delete[] ids;
+
return deps1;
}
*/
void
emit_inst_dependencies(fs_visitor *shader,
- const std::vector<ordered_address> &jps,
- const std::vector<std::vector<dependency>> &deps)
+ const ordered_address *jps,
+ const dependency_list *deps)
{
unsigned ip = 0;
foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
- tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip]);
+ const bool exec_all = inst->force_writemask_all;
+ tgl_swsb swsb = ordered_dependency_swsb(deps[ip], jps[ip], exec_all);
const tgl_sbid_mode unordered_mode =
baked_unordered_dependency_mode(inst, deps[ip], jps[ip]);
- for (const auto &dep : deps[ip]) {
+ for (unsigned i = 0; i < deps[ip].size(); i++) {
+ const dependency &dep = deps[ip][i];
+
if (dep.unordered) {
- if (unordered_mode == dep.unordered && !swsb.mode) {
+ if (unordered_mode == dep.unordered &&
+ exec_all >= dep.exec_all && !swsb.mode) {
/* Bake unordered dependency into the instruction's SWSB if
- * possible.
+ * possible, except in cases where the current instruction
+ * isn't marked NoMask but the dependency is, since that
+ * might lead to data coherency issues due to
+ * GEN:BUG:1407528679.
*/
swsb.sbid = dep.id;
swsb.mode = dep.unordered;
* instruction.
*/
const fs_builder ibld = fs_builder(shader, block, inst)
- .exec_all().group(1, 0);
+ .exec_all().group(1, 0);
fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
brw_imm_ud(TGL_SYNC_NOP));
sync->sched.sbid = dep.id;
}
}
+ for (unsigned i = 0; i < deps[ip].size(); i++) {
+ const dependency &dep = deps[ip][i];
+
+ if (dep.ordered && dep.exec_all > exec_all &&
+ find_ordered_dependency(deps[ip], jps[ip], true)) {
+ /* If the current instruction is not marked NoMask but an
+ * ordered dependency is, perform the synchronization as a
+ * separate NoMask SYNC instruction in order to avoid data
+ * coherency issues due to GEN:BUG:1407528679. The similar
+ * scenario with unordered dependencies should have been
+ * handled above.
+ */
+ const fs_builder ibld = fs_builder(shader, block, inst)
+ .exec_all().group(1, 0);
+ fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
+ brw_imm_ud(TGL_SYNC_NOP));
+ sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
+ break;
+ }
+ }
+
/* Update the IR. */
inst->sched = swsb;
inst->no_dd_check = inst->no_dd_clear = false;
fs_visitor::lower_scoreboard()
{
if (devinfo->gen >= 12) {
- const std::vector<ordered_address> jps = ordered_inst_addresses(this);
- emit_inst_dependencies(this, jps,
- allocate_inst_dependencies(this,
- gather_inst_dependencies(this, jps)));
+ const ordered_address *jps = ordered_inst_addresses(this);
+ const dependency_list *deps0 = gather_inst_dependencies(this, jps);
+ const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
+ emit_inst_dependencies(this, jps, deps1);
+ delete[] deps1;
+ delete[] deps0;
+ delete[] jps;
}
return true;