*
*/
-extern "C" {
-
-#include <sys/types.h>
-
-#include "main/macros.h"
-#include "main/shaderobj.h"
-#include "main/uniforms.h"
-#include "program/prog_optimize.h"
-#include "program/register_allocate.h"
-#include "program/sampler.h"
-#include "program/hash_table.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-}
#include "brw_fs.h"
-#include "../glsl/glsl_types.h"
-#include "../glsl/ir_optimization.h"
-#include "../glsl/ir_print_visitor.h"
+#include "glsl/glsl_types.h"
+#include "glsl/ir_optimization.h"
+#include "glsl/ir_print_visitor.h"
/** @file brw_fs_schedule_instructions.cpp
*
int math_latency = 22;
switch (inst->opcode) {
- case FS_OPCODE_RCP:
+ case SHADER_OPCODE_RCP:
this->latency = 1 * chans * math_latency;
break;
- case FS_OPCODE_RSQ:
+ case SHADER_OPCODE_RSQ:
this->latency = 2 * chans * math_latency;
break;
- case FS_OPCODE_SQRT:
- case FS_OPCODE_LOG2:
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_LOG2:
/* full precision log. partial is 2. */
this->latency = 3 * chans * math_latency;
break;
- case FS_OPCODE_EXP2:
+ case SHADER_OPCODE_INT_REMAINDER:
+ case SHADER_OPCODE_EXP2:
/* full precision. partial is 3, same throughput. */
this->latency = 4 * chans * math_latency;
break;
- case FS_OPCODE_POW:
+ case SHADER_OPCODE_POW:
this->latency = 8 * chans * math_latency;
break;
- case FS_OPCODE_SIN:
- case FS_OPCODE_COS:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
/* minimum latency, max is 12 rounds. */
this->latency = 5 * chans * math_latency;
break;
instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count)
{
this->v = v;
- this->mem_ctx = talloc_new(mem_ctx);
+ this->mem_ctx = ralloc_context(mem_ctx);
this->virtual_grf_count = virtual_grf_count;
this->instructions.make_empty();
this->instructions_to_schedule = 0;
~instruction_scheduler()
{
- talloc_free(this->mem_ctx);
+ ralloc_free(this->mem_ctx);
}
void add_barrier_deps(schedule_node *n);
void add_dep(schedule_node *before, schedule_node *after, int latency);
+ void add_dep(schedule_node *before, schedule_node *after);
void add_inst(fs_inst *inst);
void calculate_deps();
void schedule_instructions(fs_inst *next_block_header);
+ bool is_compressed(fs_inst *inst);
+
void *mem_ctx;
int instructions_to_schedule;
else
before->child_array_size *= 2;
- before->children = talloc_realloc(mem_ctx, before->children,
- schedule_node *,
- before->child_array_size);
- before->child_latency = talloc_realloc(mem_ctx, before->child_latency,
- int, before->child_array_size);
+ before->children = reralloc(mem_ctx, before->children,
+ schedule_node *,
+ before->child_array_size);
+ before->child_latency = reralloc(mem_ctx, before->child_latency,
+ int, before->child_array_size);
}
before->children[before->child_count] = after;
after->parent_count++;
}
+/**
+ * Convenience overload of add_dep() that uses the latency of the node
+ * that must execute first as the latency of the dependency edge.
+ *
+ * Either node may be NULL, in which case no dependency is recorded.
+ * calculate_deps() passes its "last writer" trackers straight through
+ * without checking them: the read-after-write and write-after-write
+ * passes hand a possibly-NULL tracker in as \c before, while the
+ * write-after-read pass hands it in as \c after.  Both sides therefore
+ * have to be checked here, since recording the edge dereferences both
+ * nodes.
+ */
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
+{
+   if (!before || !after)
+      return;
+
+   add_dep(before, after, before->latency);
+}
+
/**
* Sometimes we really want this node to execute after everything that
* was before it and before everything that followed it. This adds
}
}
+/**
+ * Reports whether \c inst is treated as a compressed instruction.
+ *
+ * That is the case when the dispatch width is 16 and neither
+ * force_uncompressed nor force_sechalf is set on the instruction.  The
+ * scheduler needs this because an MRF write by such an instruction
+ * actually writes 2 MRFs, and both have to be tracked.
+ */
+bool
+instruction_scheduler::is_compressed(fs_inst *inst)
+{
+   if (v->c->dispatch_width != 16)
+      return false;
+
+   if (inst->force_uncompressed || inst->force_sechalf)
+      return false;
+
+   return true;
+}
+
void
instruction_scheduler::calculate_deps()
{
schedule_node *last_grf_write[virtual_grf_count];
schedule_node *last_mrf_write[BRW_MAX_MRF];
schedule_node *last_conditional_mod = NULL;
+ /* Fixed HW registers are assumed to be separate from the virtual
+ * GRFs, so they can be tracked separately. We don't really write
+ * to fixed GRFs much, so don't bother tracking them on a more
+ * granular level.
+ */
+ schedule_node *last_fixed_grf_write = NULL;
/* The last instruction always needs to still be the last
* instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
memset(last_mrf_write, 0, sizeof(last_mrf_write));
/* top-to-bottom dependencies: RAW and WAW. */
- foreach_iter(exec_list_iterator, iter, instructions) {
- schedule_node *n = (schedule_node *)iter.get();
+ foreach_list(node, &instructions) {
+ schedule_node *n = (schedule_node *)node;
fs_inst *inst = n->inst;
/* read-after-write deps. */
for (int i = 0; i < 3; i++) {
if (inst->src[i].file == GRF) {
- if (last_grf_write[inst->src[i].reg]) {
- add_dep(last_grf_write[inst->src[i].reg], n,
- last_grf_write[inst->src[i].reg]->latency);
- }
+ add_dep(last_grf_write[inst->src[i].reg], n);
+ } else if (inst->src[i].file == FIXED_HW_REG &&
+ (inst->src[i].fixed_hw_reg.file ==
+ BRW_GENERAL_REGISTER_FILE)) {
+ add_dep(last_fixed_grf_write, n);
} else if (inst->src[i].file != BAD_FILE &&
inst->src[i].file != IMM &&
inst->src[i].file != UNIFORM) {
* instruction once it's sent, not when the result comes
* back.
*/
- if (last_mrf_write[inst->base_mrf + i]) {
- add_dep(last_mrf_write[inst->base_mrf + i], n,
- last_mrf_write[inst->base_mrf + i]->latency);
- }
+ add_dep(last_mrf_write[inst->base_mrf + i], n);
}
if (inst->predicated) {
assert(last_conditional_mod);
- add_dep(last_conditional_mod, n, last_conditional_mod->latency);
+ add_dep(last_conditional_mod, n);
}
/* write-after-write deps. */
if (inst->dst.file == GRF) {
- if (last_grf_write[inst->dst.reg]) {
- add_dep(last_grf_write[inst->dst.reg], n,
- last_grf_write[inst->dst.reg]->latency);
- }
+ add_dep(last_grf_write[inst->dst.reg], n);
last_grf_write[inst->dst.reg] = n;
} else if (inst->dst.file == MRF) {
- if (last_mrf_write[inst->dst.hw_reg]) {
- add_dep(last_mrf_write[inst->dst.hw_reg], n,
- last_mrf_write[inst->dst.hw_reg]->latency);
+ int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+
+ add_dep(last_mrf_write[reg], n);
+ last_mrf_write[reg] = n;
+ if (is_compressed(inst)) {
+ if (inst->dst.reg & BRW_MRF_COMPR4)
+ reg += 4;
+ else
+ reg++;
+ add_dep(last_mrf_write[reg], n);
+ last_mrf_write[reg] = n;
}
- last_mrf_write[inst->dst.hw_reg] = n;
+ } else if (inst->dst.file == FIXED_HW_REG &&
+ inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+ last_fixed_grf_write = n;
} else if (inst->dst.file != BAD_FILE) {
add_barrier_deps(n);
}
if (inst->mlen > 0) {
for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
- if (last_mrf_write[inst->base_mrf + i]) {
- add_dep(last_mrf_write[inst->base_mrf + i], n,
- last_mrf_write[inst->base_mrf + i]->latency);
- }
+ add_dep(last_mrf_write[inst->base_mrf + i], n);
last_mrf_write[inst->base_mrf + i] = n;
}
}
memset(last_grf_write, 0, sizeof(last_grf_write));
memset(last_mrf_write, 0, sizeof(last_mrf_write));
last_conditional_mod = NULL;
+ last_fixed_grf_write = NULL;
exec_node *node;
exec_node *prev;
/* write-after-read deps. */
for (int i = 0; i < 3; i++) {
if (inst->src[i].file == GRF) {
- if (last_grf_write[inst->src[i].reg]) {
- add_dep(n, last_grf_write[inst->src[i].reg], n->latency);
- }
+ add_dep(n, last_grf_write[inst->src[i].reg]);
+ } else if (inst->src[i].file == FIXED_HW_REG &&
+ (inst->src[i].fixed_hw_reg.file ==
+ BRW_GENERAL_REGISTER_FILE)) {
+ add_dep(n, last_fixed_grf_write);
} else if (inst->src[i].file != BAD_FILE &&
inst->src[i].file != IMM &&
inst->src[i].file != UNIFORM) {
}
if (inst->predicated) {
- if (last_conditional_mod) {
- add_dep(n, last_conditional_mod, n->latency);
- }
+ add_dep(n, last_conditional_mod);
}
/* Update the things this instruction wrote, so earlier reads
if (inst->dst.file == GRF) {
last_grf_write[inst->dst.reg] = n;
} else if (inst->dst.file == MRF) {
- last_mrf_write[inst->dst.hw_reg] = n;
+ int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+
+ last_mrf_write[reg] = n;
+
+ if (is_compressed(inst)) {
+ if (inst->dst.reg & BRW_MRF_COMPR4)
+ reg += 4;
+ else
+ reg++;
+
+ last_mrf_write[reg] = n;
+ }
+ } else if (inst->dst.file == FIXED_HW_REG &&
+ inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+ last_fixed_grf_write = n;
} else if (inst->dst.file != BAD_FILE) {
add_barrier_deps(n);
}
int time = 0;
/* Remove non-DAG heads from the list. */
- foreach_iter(exec_list_iterator, iter, instructions) {
- schedule_node *n = (schedule_node *)iter.get();
+ foreach_list_safe(node, &instructions) {
+ schedule_node *n = (schedule_node *)node;
if (n->parent_count != 0)
n->remove();
}
schedule_node *chosen = NULL;
int chosen_time = 0;
- foreach_iter(exec_list_iterator, iter, instructions) {
- schedule_node *n = (schedule_node *)iter.get();
+ foreach_list(node, &instructions) {
+ schedule_node *n = (schedule_node *)node;
if (!chosen || n->unblocked_time < chosen_time) {
chosen = n;
* progress until the first is done.
*/
if (chosen->inst->is_math()) {
- foreach_iter(exec_list_iterator, iter, instructions) {
- schedule_node *n = (schedule_node *)iter.get();
+ foreach_list(node, &instructions) {
+ schedule_node *n = (schedule_node *)node;
if (n->inst->is_math())
n->unblocked_time = MAX2(n->unblocked_time,