r600g: merge alu groups
author Christian König <deathsimple@vodafone.de>
Mon, 20 Dec 2010 21:09:09 +0000 (22:09 +0100)
committer Christian König <deathsimple@vodafone.de>
Thu, 13 Jan 2011 22:01:35 +0000 (23:01 +0100)
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_asm.h

index e2d52c3a46bffedc543a13538eedceb19b0e736d..ca2bf93b0b81dfd6c1c9d7908aca9203d33daf77 100644 (file)
@@ -312,7 +312,7 @@ static int assign_alu_units(struct r600_bc_alu *alu_first, struct r600_bc_alu *a
        for (i = 0; i < 5; i++)
                assignment[i] = NULL;
 
-       for (alu = alu_first; alu; alu = container_of(alu->list.next, alu, list)) {
+       for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
                chan = alu->dst.chan;
                if (is_alu_trans_unit_inst(alu))
                        trans = 1;
@@ -502,24 +502,21 @@ static int check_scalar(struct r600_bc_alu *alu, struct alu_bank_swizzle *bs, in
        }
        return 0;
 }
-static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *alu_first)
+
+static int check_and_set_bank_swizzle(struct r600_bc_alu *slots[5])
 {
-       struct r600_bc_alu *assignment[5];
        struct alu_bank_swizzle bs;
        int bank_swizzle[5];
-       int i, r;
+       int i, r = 0, forced = 0;
  
-       r = assign_alu_units(alu_first, assignment);
-       if (r)
-               return r;
-       if(alu_first->bank_swizzle_force) {
-               for (i = 0; i < 5; i++)
-                       if (assignment[i])
-                               assignment[i]->bank_swizzle = assignment[i]->bank_swizzle_force;
+       for (i = 0; i < 5; i++)
+               if (slots[i] && slots[i]->bank_swizzle_force) {
+                       slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
+                       forced = 1;
+               }
+
+       if (forced)
                return 0;
-       }
 
        // just check every possible combination of bank swizzle
        // not very efficent, but works on the first try in most of the cases
@@ -529,19 +526,19 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *al
        while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
                init_bank_swizzle(&bs);
                for (i = 0; i < 4; i++) {
-                       if (assignment[i]) {
-                               r = check_vector(assignment[i], &bs, bank_swizzle[i]);
+                       if (slots[i]) {
+                               r = check_vector(slots[i], &bs, bank_swizzle[i]);
                                if (r)
                                        break;
                        }
                }
-               if (!r && assignment[4]) {
-                       r = check_scalar(assignment[4], &bs, bank_swizzle[4]);
+               if (!r && slots[4]) {
+                       r = check_scalar(slots[4], &bs, bank_swizzle[4]);
                }
                if (!r) {
                        for (i = 0; i < 5; i++) {
-                               if (assignment[i])
-                                       assignment[i]->bank_swizzle = bank_swizzle[i];
+                               if (slots[i])
+                                       slots[i]->bank_swizzle = bank_swizzle[i];
                        }
                        return 0;
                }
@@ -559,32 +556,27 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *al
        return -1;
 }
 
-static int replace_gpr_with_pv_ps(struct r600_bc_alu *alu_first, struct r600_bc_alu *alu_prev)
+static int replace_gpr_with_pv_ps(struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
 {
-       struct r600_bc_alu *slots[5];
+       struct r600_bc_alu *prev[5];
        int gpr[5], chan[5];
        int i, j, r, src, num_src;
        
-       r = assign_alu_units(alu_prev, slots);
+       r = assign_alu_units(alu_prev, prev);
        if (r)
                return r;
 
        for (i = 0; i < 5; ++i) {
-               if(slots[i] && slots[i]->dst.write && !slots[i]->dst.rel) {
-                       gpr[i] = slots[i]->dst.sel;
-                       if (is_alu_reduction_inst(slots[i]))
+               if(prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) {
+                       gpr[i] = prev[i]->dst.sel;
+                       if (is_alu_reduction_inst(prev[i]))
                                chan[i] = 0;
                        else
-                               chan[i] = slots[i]->dst.chan;
+                               chan[i] = prev[i]->dst.chan;
                } else
-                       gpr[i] = -1;
-               
+                       gpr[i] = -1;            
        }
 
-       r = assign_alu_units(alu_first, slots);
-       if (r)
-               return r;
-
        for (i = 0; i < 5; ++i) {
                struct r600_bc_alu *alu = slots[i];
                if(!alu)
@@ -616,6 +608,109 @@ static int replace_gpr_with_pv_ps(struct r600_bc_alu *alu_first, struct r600_bc_
        return 0;
 }
 
+static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
+{      /* Try to fold the previous ALU group (headed by alu_prev) and the current group (slots) into one group. */
+       struct r600_bc_alu *prev[5];              /* unit assignment of the previous group */
+       struct r600_bc_alu *result[5] = { NULL }; /* merged assignment being built; all entries start out NULL */
+       int i, j, r, src, num_src;
+       int num_once_inst = 0;                    /* running sum of is_alu_once_inst() over instructions counted below */
+       /* Returns 0 on a successful merge AND when merging is not possible; nonzero only if assign_alu_units() fails. */
+       r = assign_alu_units(alu_prev, prev);
+       if (r)
+               return r;
+
+       for (i = 0; i < 5; ++i) {
+               // TODO: groups carrying literals are not merged at all for now
+               if (prev[i] && prev[i]->nliteral)
+                       return 0;
+               if (slots[i] && slots[i]->nliteral)
+                       return 0;
+
+               // let's check used slots: a slot used by only one of the two groups is taken over as is,
+               // a slot used by both groups needs the still-free trans unit (index 4) for one of the two
+               if (prev[i] && !slots[i]) {
+                       result[i] = prev[i];
+                       num_once_inst += is_alu_once_inst(prev[i]);
+                       continue;
+               } else if (prev[i] && slots[i]) {
+                       if (result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
+                               // trans unit is still free, try to use it
+                               if (is_alu_any_unit_inst(slots[i])) {
+                                       result[i] = prev[i];
+                                       result[4] = slots[i];
+                               } else if (is_alu_any_unit_inst(prev[i])) {
+                                       result[i] = slots[i];
+                                       result[4] = prev[i];
+                               } else /* neither instruction passes is_alu_any_unit_inst(): give up */
+                                       return 0;
+                       } else /* index 4 is already taken by one of the groups or by an earlier move: give up */
+                               return 0;
+               } else if(!slots[i]) { /* slot is unused by both groups */
+                       continue;
+               } else /* only the current group uses this slot */
+                       result[i] = slots[i];
+               /* NOTE(review): in the shared-slot case prev[i] is never added to num_once_inst - confirm this is intended */
+               // let's check source gprs of the current group's instruction against the previous group's writes
+               struct r600_bc_alu *alu = slots[i];
+               num_once_inst += is_alu_once_inst(alu);
+
+               num_src = r600_bc_get_num_operands(alu);
+               for (src = 0; src < num_src; ++src) {
+                       // constants don't matter
+                       if (!is_gpr(alu->src[src].sel))
+                               continue;
+
+                       for (j = 0; j < 5; ++j) {
+                               if (!prev[j] || !prev[j]->dst.write)
+                                       continue;
+
+                               // if it's relative then we can't determine which gpr is really used
+                               if (prev[j]->dst.chan == alu->src[src].chan &&
+                                       (prev[j]->dst.sel == alu->src[src].sel ||
+                                       prev[j]->dst.rel || alu->src[src].rel))
+                                       return 0;
+                       }
+               }
+       }
+       /* NOTE(review): both groups writing the same destination gpr/channel is not checked above - confirm this is safe */
+       /* more than one PRED_ or KILL_ ? */
+       if (num_once_inst > 1)
+               return 0;
+
+       /* check if the result can still be swizzled */
+       r = check_and_set_bank_swizzle(result);
+       if (r)
+               return 0;
+
+       /* looks like everything worked out right, apply the changes */
+
+       /* sort instructions: unlink each one and re-append it to bc->cf_last->alu in index order */
+       for (i = 0; i < 5; ++i) {
+               slots[i] = result[i];
+               if (result[i]) {
+                       LIST_DEL(&result[i]->list);
+                       result[i]->last = 0;
+                       LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
+               }
+       }
+
+       /* determine new last instruction: the tail of the list now ends the merged group */
+       LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;
+
+       /* determine new first instruction: the lowest occupied index in result */
+       for (i = 0; i < 5; ++i) {
+               if (result[i]) {
+                       bc->cf_last->curr_bs_head = result[i];
+                       break;
+               }
+       }
+       /* the previous group was absorbed, so the group before it becomes the new previous group */
+       bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
+       bc->cf_last->prev2_bs_head = NULL;
+
+       return 0;
+}
+
 /* This code handles kcache lines as single blocks of 32 constants. We could
  * probably do slightly better by recognizing that we actually have two
  * consecutive lines of 16 constants, but the resulting code would also be
@@ -775,7 +870,7 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int
        if (!bc->cf_last->curr_bs_head) {
                bc->cf_last->curr_bs_head = nalu;
        }
-       /* at most 128 slots, one add alu can add 4 slots + 4 constants(2 slots)
+       /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
         * worst case */
        if (nalu->last && (bc->cf_last->ndw >> 1) >= 120) {
                bc->force_add_cf = 1;
@@ -810,11 +905,28 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int
 
        /* process cur ALU instructions for bank swizzle */
        if (nalu->last) {
-               if (bc->cf_last->prev_bs_head)
-                       replace_gpr_with_pv_ps(bc->cf_last->curr_bs_head, bc->cf_last->prev_bs_head);
-               r = check_and_set_bank_swizzle(bc, bc->cf_last->curr_bs_head);
+               struct r600_bc_alu *slots[5];
+               r = assign_alu_units(bc->cf_last->curr_bs_head, slots);
                if (r)
                        return r;
+
+               if (bc->cf_last->prev_bs_head) {
+                       r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
+                       if (r)
+                               return r;
+               }
+
+               if (bc->cf_last->prev_bs_head) {
+                       r = replace_gpr_with_pv_ps(slots, bc->cf_last->prev_bs_head);
+                       if (r)
+                               return r;
+               }
+
+               r = check_and_set_bank_swizzle(slots);
+               if (r)
+                       return r;
+
+               bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
                bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
                bc->cf_last->curr_bs_head = NULL;
        }
index 2a046d1e88d5e82f3b4111342fc1a3ab8d5054bd..570292e9fdc9e44ad2f325235c558dcc57753bca 100644 (file)
@@ -144,6 +144,7 @@ struct r600_bc_cf {
        struct r600_bc_output           output;
        struct r600_bc_alu              *curr_bs_head;
        struct r600_bc_alu              *prev_bs_head;
+       struct r600_bc_alu              *prev2_bs_head;
 };
 
 #define FC_NONE                                0