From d7342f6a81a0d13acb6486a24bffa8e5987d5410 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Dec 2010 22:09:09 +0100 Subject: [PATCH] r600g: merge alu groups --- src/gallium/drivers/r600/r600_asm.c | 186 ++++++++++++++++++++++------ src/gallium/drivers/r600/r600_asm.h | 1 + 2 files changed, 150 insertions(+), 37 deletions(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index e2d52c3a46b..ca2bf93b0b8 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -312,7 +312,7 @@ static int assign_alu_units(struct r600_bc_alu *alu_first, struct r600_bc_alu *a for (i = 0; i < 5; i++) assignment[i] = NULL; - for (alu = alu_first; alu; alu = container_of(alu->list.next, alu, list)) { + for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) { chan = alu->dst.chan; if (is_alu_trans_unit_inst(alu)) trans = 1; @@ -502,24 +502,21 @@ static int check_scalar(struct r600_bc_alu *alu, struct alu_bank_swizzle *bs, in } return 0; } - -static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *alu_first) + +static int check_and_set_bank_swizzle(struct r600_bc_alu *slots[5]) { - struct r600_bc_alu *assignment[5]; struct alu_bank_swizzle bs; int bank_swizzle[5]; - int i, r; + int i, r = 0, forced = 0; - r = assign_alu_units(alu_first, assignment); - if (r) - return r; - - if(alu_first->bank_swizzle_force) { - for (i = 0; i < 5; i++) - if (assignment[i]) - assignment[i]->bank_swizzle = assignment[i]->bank_swizzle_force; + for (i = 0; i < 5; i++) + if (slots[i] && slots[i]->bank_swizzle_force) { + slots[i]->bank_swizzle = slots[i]->bank_swizzle_force; + forced = 1; + } + + if (forced) return 0; - } // just check every possible combination of bank swizzle // not very efficent, but works on the first try in most of the cases @@ -529,19 +526,19 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *al 
while(bank_swizzle[4] <= SQ_ALU_SCL_221) { init_bank_swizzle(&bs); for (i = 0; i < 4; i++) { - if (assignment[i]) { - r = check_vector(assignment[i], &bs, bank_swizzle[i]); + if (slots[i]) { + r = check_vector(slots[i], &bs, bank_swizzle[i]); if (r) break; } } - if (!r && assignment[4]) { - r = check_scalar(assignment[4], &bs, bank_swizzle[4]); + if (!r && slots[4]) { + r = check_scalar(slots[4], &bs, bank_swizzle[4]); } if (!r) { for (i = 0; i < 5; i++) { - if (assignment[i]) - assignment[i]->bank_swizzle = bank_swizzle[i]; + if (slots[i]) + slots[i]->bank_swizzle = bank_swizzle[i]; } return 0; } @@ -559,32 +556,27 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *al return -1; } -static int replace_gpr_with_pv_ps(struct r600_bc_alu *alu_first, struct r600_bc_alu *alu_prev) +static int replace_gpr_with_pv_ps(struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev) { - struct r600_bc_alu *slots[5]; + struct r600_bc_alu *prev[5]; int gpr[5], chan[5]; int i, j, r, src, num_src; - r = assign_alu_units(alu_prev, slots); + r = assign_alu_units(alu_prev, prev); if (r) return r; for (i = 0; i < 5; ++i) { - if(slots[i] && slots[i]->dst.write && !slots[i]->dst.rel) { - gpr[i] = slots[i]->dst.sel; - if (is_alu_reduction_inst(slots[i])) + if(prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) { + gpr[i] = prev[i]->dst.sel; + if (is_alu_reduction_inst(prev[i])) chan[i] = 0; else - chan[i] = slots[i]->dst.chan; + chan[i] = prev[i]->dst.chan; } else - gpr[i] = -1; - + gpr[i] = -1; } - r = assign_alu_units(alu_first, slots); - if (r) - return r; - for (i = 0; i < 5; ++i) { struct r600_bc_alu *alu = slots[i]; if(!alu) @@ -616,6 +608,109 @@ static int replace_gpr_with_pv_ps(struct r600_bc_alu *alu_first, struct r600_bc_ return 0; } +static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev) +{ + struct r600_bc_alu *prev[5]; + struct r600_bc_alu *result[5] = { NULL }; + int i, j, r, src, num_src; + 
int num_once_inst = 0; + + r = assign_alu_units(alu_prev, prev); + if (r) + return r; + + for (i = 0; i < 5; ++i) { + // TODO: we have literals? forget it! + if (prev[i] && prev[i]->nliteral) + return 0; + if (slots[i] && slots[i]->nliteral) + return 0; + + + // let's check used slots + if (prev[i] && !slots[i]) { + result[i] = prev[i]; + num_once_inst += is_alu_once_inst(prev[i]); + continue; + } else if (prev[i] && slots[i]) { + if (result[4] == NULL && prev[4] == NULL && slots[4] == NULL) { + // trans unit is still free try to use it + if (is_alu_any_unit_inst(slots[i])) { + result[i] = prev[i]; + result[4] = slots[i]; + } else if (is_alu_any_unit_inst(prev[i])) { + result[i] = slots[i]; + result[4] = prev[i]; + } else + return 0; + } else + return 0; + } else if(!slots[i]) { + continue; + } else + result[i] = slots[i]; + + // let's check source gprs + struct r600_bc_alu *alu = slots[i]; + num_once_inst += is_alu_once_inst(alu); + + num_src = r600_bc_get_num_operands(alu); + for (src = 0; src < num_src; ++src) { + // constants don't matter + if (!is_gpr(alu->src[src].sel)) + continue; + + for (j = 0; j < 5; ++j) { + if (!prev[j] || !prev[j]->dst.write) + continue; + + // if it's relative then we can't determine which gpr is really used + if (prev[j]->dst.chan == alu->src[src].chan && + (prev[j]->dst.sel == alu->src[src].sel || + prev[j]->dst.rel || alu->src[src].rel)) + return 0; + } + } + } + + /* more than one PRED_ or KILL_ ? 
*/ + if (num_once_inst > 1) + return 0; + + /* check if the result can still be swizzled */ + r = check_and_set_bank_swizzle(result); + if (r) + return 0; + + /* looks like everything worked out right, apply the changes */ + + /* sort instructions */ + for (i = 0; i < 5; ++i) { + slots[i] = result[i]; + if (result[i]) { + LIST_DEL(&result[i]->list); + result[i]->last = 0; + LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu); + } + } + + /* determine new last instruction */ + LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1; + + /* determine new first instruction */ + for (i = 0; i < 5; ++i) { + if (result[i]) { + bc->cf_last->curr_bs_head = result[i]; + break; + } + } + + bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head; + bc->cf_last->prev2_bs_head = NULL; + + return 0; +} + /* This code handles kcache lines as single blocks of 32 constants. We could * probably do slightly better by recognizing that we actually have two * consecutive lines of 16 constants, but the resulting code would also be @@ -775,7 +870,7 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int if (!bc->cf_last->curr_bs_head) { bc->cf_last->curr_bs_head = nalu; } - /* at most 128 slots, one add alu can add 4 slots + 4 constants(2 slots) + /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots) * worst case */ if (nalu->last && (bc->cf_last->ndw >> 1) >= 120) { bc->force_add_cf = 1; @@ -810,11 +905,28 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int /* process cur ALU instructions for bank swizzle */ if (nalu->last) { - if (bc->cf_last->prev_bs_head) - replace_gpr_with_pv_ps(bc->cf_last->curr_bs_head, bc->cf_last->prev_bs_head); - r = check_and_set_bank_swizzle(bc, bc->cf_last->curr_bs_head); + struct r600_bc_alu *slots[5]; + r = assign_alu_units(bc->cf_last->curr_bs_head, slots); if (r) return r; + + if (bc->cf_last->prev_bs_head) { + r = merge_inst_groups(bc, slots, 
bc->cf_last->prev_bs_head); + if (r) + return r; + } + + if (bc->cf_last->prev_bs_head) { + r = replace_gpr_with_pv_ps(slots, bc->cf_last->prev_bs_head); + if (r) + return r; + } + + r = check_and_set_bank_swizzle(slots); + if (r) + return r; + + bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head; bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head; bc->cf_last->curr_bs_head = NULL; } diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 2a046d1e88d..570292e9fdc 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -144,6 +144,7 @@ struct r600_bc_cf { struct r600_bc_output output; struct r600_bc_alu *curr_bs_head; struct r600_bc_alu *prev_bs_head; + struct r600_bc_alu *prev2_bs_head; }; #define FC_NONE 0 -- 2.30.2