r600g/sb: work around hw issues with stack on eg/cm
author Vadim Girlin <vadimgirlin@gmail.com>
Fri, 15 Nov 2013 17:24:53 +0000 (18:24 +0100)
committer Vadim Girlin <vadimgirlin@gmail.com>
Sat, 16 Nov 2013 21:36:28 +0000 (01:36 +0400)
v2: make it actually work, improve condition

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=68503
Cc: "10.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Vadim Girlin <vadimgirlin@gmail.com>
src/gallium/drivers/r600/sb/sb_bc.h
src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
src/gallium/drivers/r600/sb/sb_context.cpp
src/gallium/drivers/r600/sb/sb_ir.h
src/gallium/drivers/r600/sb/sb_pass.h

index ad1b862fd57548555bc8cd120e0655a8ae606b36..73b8b08ba3957347bcf84765e09a2a597ec7b662 100644 (file)
@@ -614,6 +614,10 @@ public:
        unsigned num_slots;
        bool uses_mova_gpr;
 
+       bool stack_workaround_8xx;
+       bool stack_workaround_9xx;
+
+       unsigned wavefront_size;
        unsigned stack_entry_size;
 
        static unsigned dump_pass;
@@ -638,6 +642,23 @@ public:
        bool is_cayman() {return hw_class == HW_CLASS_CAYMAN;}
        bool is_egcm() {return hw_class >= HW_CLASS_EVERGREEN;}
 
+       bool needs_8xx_stack_workaround() {
+               if (!is_evergreen())
+                       return false;
+
+               switch (hw_chip) {
+               case HW_CHIP_CYPRESS:
+               case HW_CHIP_JUNIPER:
+                       return false;
+               default:
+                       return true;
+               }
+       }
+
+       bool needs_9xx_stack_workaround() {
+               return is_cayman();
+       }
+
        sb_hw_class_bits hw_class_bit() {
                switch (hw_class) {
                case HW_CLASS_R600:return HB_R6;
index c56c866bafc222dad44f1ef5dabf6f749d8b57ba..bc71cf873da48a0c1afef74e32b0553fd4a192ce 100644 (file)
@@ -40,8 +40,9 @@ namespace r600_sb {
 
 int bc_finalizer::run() {
 
-       regions_vec &rv = sh.get_regions();
+       run_on(sh.root);
 
+       regions_vec &rv = sh.get_regions();
        for (regions_vec::reverse_iterator I = rv.rbegin(), E = rv.rend(); I != E;
                        ++I) {
                region_node *r = *I;
@@ -58,8 +59,6 @@ int bc_finalizer::run() {
                r->expand();
        }
 
-       run_on(sh.root);
-
        cf_peephole();
 
        // workaround for some problems on r6xx/7xx
@@ -213,18 +212,36 @@ void bc_finalizer::run_on(container_node* c) {
                if (n->is_alu_group()) {
                        finalize_alu_group(static_cast<alu_group_node*>(n));
                } else {
-                       if (n->is_fetch_inst()) {
+                       if (n->is_alu_clause()) {
+                               cf_node *c = static_cast<cf_node*>(n);
+
+                               if (c->bc.op == CF_OP_ALU_PUSH_BEFORE && ctx.is_egcm()) {
+                                       if (ctx.stack_workaround_8xx) {
+                                               region_node *r = c->get_parent_region();
+                                               if (r) {
+                                                       unsigned ifs, loops;
+                                                       unsigned elems = get_stack_depth(r, loops, ifs);
+                                                       unsigned dmod1 = elems % ctx.stack_entry_size;
+                                                       unsigned dmod2 = (elems + 1) % ctx.stack_entry_size;
+
+                                                       if (elems && (!dmod1 || !dmod2))
+                                                               c->flags |= NF_ALU_STACK_WORKAROUND;
+                                               }
+                                       } else if (ctx.stack_workaround_9xx) {
+                                               region_node *r = c->get_parent_region();
+                                               if (r) {
+                                                       unsigned ifs, loops;
+                                                       get_stack_depth(r, loops, ifs);
+                                                       if (loops >= 2)
+                                                               c->flags |= NF_ALU_STACK_WORKAROUND;
+                                               }
+                                       }
+                               }
+                       } else if (n->is_fetch_inst()) {
                                finalize_fetch(static_cast<fetch_node*>(n));
                        } else if (n->is_cf_inst()) {
                                finalize_cf(static_cast<cf_node*>(n));
-                       } else if (n->is_alu_clause()) {
-
-                       } else if (n->is_fetch_clause()) {
-
-                       } else {
-                               assert(!"unexpected node");
                        }
-
                        if (n->is_container())
                                run_on(static_cast<container_node*>(n));
                }
@@ -578,10 +595,6 @@ void bc_finalizer::finalize_cf(cf_node* c) {
 
        unsigned flags = c->bc.op_ptr->flags;
 
-       if (flags & CF_CALL) {
-               update_nstack(c->get_parent_region(), ctx.is_cayman() ? 1 : 2);
-       }
-
        c->bc.end_of_program = 0;
        last_cf = c;
 
@@ -715,17 +728,8 @@ void bc_finalizer::finalize_cf(cf_node* c) {
 
                        c->bc.index_gpr = reg >= 0 ? reg : 0;
                }
-
-
-
-       } else {
-
-#if 0
-               if ((flags & (CF_BRANCH | CF_LOOP)) && !sh.uses_gradients) {
-                       c->bc.valid_pixel_mode = 1;
-               }
-#endif
-
+       } else if (flags & CF_CALL) {
+               update_nstack(c->get_parent_region(), ctx.wavefront_size == 16 ? 2 : 1);
        }
 }
 
@@ -763,37 +767,78 @@ void bc_finalizer::update_ngpr(unsigned gpr) {
                ngpr = gpr + 1;
 }
 
-void bc_finalizer::update_nstack(region_node* r, unsigned add) {
-       unsigned loops = 0;
-       unsigned ifs = 0;
+unsigned bc_finalizer::get_stack_depth(node *n, unsigned &loops,
+                                           unsigned &ifs, unsigned add) {
+       unsigned stack_elements = add;
+       bool has_non_wqm_push_with_loops_on_stack = false;
+       bool has_non_wqm_push = (add != 0);
+       region_node *r = n->is_region() ?
+                       static_cast<region_node*>(n) : n->get_parent_region();
+
+       loops = 0;
+       ifs = 0;
 
        while (r) {
-               if (r->is_loop())
+               if (r->is_loop()) {
                        ++loops;
-               else
+                       if (has_non_wqm_push)
+                               has_non_wqm_push_with_loops_on_stack = true;
+               } else {
                        ++ifs;
-
+                       has_non_wqm_push = true;
+               }
                r = r->get_parent_region();
        }
-
-       unsigned stack_elements = (loops * ctx.stack_entry_size) + ifs + add;
-
-       // FIXME calculate more precisely
-       if (ctx.is_evergreen()) {
-               ++stack_elements;
-       } else {
-               stack_elements += 2;
-               if (ctx.is_cayman())
+       stack_elements += (loops * ctx.stack_entry_size) + ifs;
+
+       // reserve additional elements in some cases
+       switch (ctx.hw_class) {
+       case HW_CLASS_R600:
+       case HW_CLASS_R700:
+               if (has_non_wqm_push)
+                       stack_elements += 2;
+               break;
+       case HW_CLASS_CAYMAN:
+               if (stack_elements)
+                       stack_elements += 2;
+               break;
+       case HW_CLASS_EVERGREEN:
+               if (has_non_wqm_push_with_loops_on_stack)
                        ++stack_elements;
+               break;
        }
+       return stack_elements;
+}
 
-       unsigned stack_entries = (stack_elements + 3) >> 2;
+void bc_finalizer::update_nstack(region_node* r, unsigned add) {
+       unsigned loops = 0;
+       unsigned ifs = 0;
+       unsigned elems = r ? get_stack_depth(r, loops, ifs, add) : add;
+
+       // XXX all chips expect this value to be computed using 4 as entry size,
+       // not the real entry size
+       unsigned stack_entries = (elems + 3) >> 2;
 
        if (nstack < stack_entries)
                nstack = stack_entries;
 }
 
 void bc_finalizer::cf_peephole() {
+       if (ctx.stack_workaround_8xx || ctx.stack_workaround_9xx) {
+               for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E;
+                               I = N) {
+                       N = I; ++N;
+                       cf_node *c = static_cast<cf_node*>(*I);
+
+                       if (c->bc.op == CF_OP_ALU_PUSH_BEFORE &&
+                                       (c->flags & NF_ALU_STACK_WORKAROUND)) {
+                               cf_node *push = sh.create_cf(CF_OP_PUSH);
+                               c->insert_before(push);
+                               push->jump(c);
+                               c->bc.set_op(CF_OP_ALU);
+                       }
+               }
+       }
 
        for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E;
                        I = N) {
index 9474f74e89fe4a10576325d51592d46695ce8f3a..8e1142873ac617fe1424f1c11bd61c70ee3ca624 100644 (file)
@@ -66,20 +66,27 @@ int sb_context::init(r600_isa *isa, sb_hw_chip chip, sb_hw_class cclass) {
        case HW_CHIP_RS780:
        case HW_CHIP_RV620:
        case HW_CHIP_RS880:
-
+               wavefront_size = 16;
+               stack_entry_size = 8;
+               break;
        case HW_CHIP_RV630:
        case HW_CHIP_RV635:
        case HW_CHIP_RV730:
        case HW_CHIP_RV710:
        case HW_CHIP_PALM:
        case HW_CHIP_CEDAR:
+               wavefront_size = 32;
                stack_entry_size = 8;
                break;
        default:
+               wavefront_size = 64;
                stack_entry_size = 4;
                break;
        }
 
+       stack_workaround_8xx = needs_8xx_stack_workaround();
+       stack_workaround_9xx = needs_9xx_stack_workaround();
+
        return 0;
 }
 
index a74d6cb5aa2e23d2f25a40805409c7e0db212e76..85c3d06ea7f4f52d172ca69704a832736cacc3d9 100644 (file)
@@ -700,7 +700,10 @@ enum node_flags {
        NF_DONT_MOVE = (1 << 8),
 
        // for KILLxx - we want to schedule them as early as possible
-       NF_SCHEDULE_EARLY = (1 << 9)
+       NF_SCHEDULE_EARLY = (1 << 9),
+
+       // for ALU_PUSH_BEFORE - when set, replace with PUSH + ALU
+       NF_ALU_STACK_WORKAROUND = (1 << 10)
 };
 
 inline node_flags operator |(node_flags l, node_flags r) {
index a3f8515acdeae0ac02a6284bdf26d036920e587b..c955656449fbf04fa97d3df5b4f70c1c37b4a5a9 100644 (file)
@@ -708,6 +708,9 @@ public:
        void update_ngpr(unsigned gpr);
        void update_nstack(region_node *r, unsigned add = 0);
 
+       unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs,
+                                unsigned add = 0);
+
        void cf_peephole();
 
 };