From: Rob Clark Date: Tue, 7 Nov 2017 20:12:03 +0000 (-0500) Subject: freedreno/ir3: moar better scheduler X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=819a613ae33410584e13e78e78af82c71716f67d;p=mesa.git freedreno/ir3: moar better scheduler Add a new pass that inserts additional dependencies, rather than simply relying on SSA srcs added in the nir->ir3 frontend. This makes it easier to deal with barriers, but the additional false deps also lets us deal properly with ensuring a write depends on all previous reads. Since conversion to barrier instructions is lossy (ie. just knowing the instruction doesn't tell us enough about what other instructions the barrier applies to), use barrier_class/barrier_conflict fields in the ir3_instruction to retain this information. This could probably be relaxed somewhat by considering *which* array/ buffer/image variable is being referenced. Ie. a write to buffer A can overtake a read from buffer B, if B is not coherent. (right?) Signed-off-by: Rob Clark --- diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c index 6db0a2a20cd..01a7bbc7dc6 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.c +++ b/src/gallium/drivers/freedreno/ir3/ir3.c @@ -817,6 +817,12 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr) return new_instr; } +/* Add a false dependency to instruction, to ensure it is scheduled first: */ +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep) +{ + array_insert(instr, instr->deps, dep); +} + struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 90f8e3c44d3..6ef0683ab00 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -326,6 +326,40 @@ struct ir3_instruction { */ struct ir3_instruction *address; + /* Tracking for additional dependent instructions. Used to handle + * barriers, WAR hazards for arrays/SSBOs/etc. + */ + DECLARE_ARRAY(struct ir3_instruction *, deps); + + /* + * From PoV of instruction scheduling, not execution (ie. ignores global/ + * local distinction): + * shared image atomic SSBO everything + * barrier()/ - R/W R/W R/W R/W X + * groupMemoryBarrier() + * memoryBarrier() - R/W R/W + * (but only images declared coherent?) + * memoryBarrierAtomic() - R/W + * memoryBarrierBuffer() - R/W + * memoryBarrierImage() - R/W + * memoryBarrierShared() - R/W + * + * TODO I think for SSBO/image/shared, in cases where we can determine + * which variable is accessed, we don't need to care about accesses to + * different variables (unless declared coherent??) + */ + enum { + IR3_BARRIER_EVERYTHING = 1 << 0, + IR3_BARRIER_SHARED_R = 1 << 1, + IR3_BARRIER_SHARED_W = 1 << 2, + IR3_BARRIER_IMAGE_R = 1 << 3, + IR3_BARRIER_IMAGE_W = 1 << 4, + IR3_BARRIER_BUFFER_R = 1 << 5, + IR3_BARRIER_BUFFER_W = 1 << 6, + IR3_BARRIER_ARRAY_R = 1 << 7, + IR3_BARRIER_ARRAY_W = 1 << 8, + } barrier_class, barrier_conflict; + /* Entry in ir3_block's instruction list: */ struct list_head node; @@ -417,16 +451,13 @@ struct ir3_array { nir_register *r; - /* We track the last write and last access (read or write) to - * setup dependencies on instructions that read or write the - * array. Reads can be re-ordered wrt. other reads, but should - * not be re-ordered wrt. to writes. Writes cannot be reordered - * wrt. any other access to the array. 
- * - * So array reads depend on last write, and array writes depend - * on the last access. + /* To avoid array write's from getting DCE'd, keep track of the + * most recent write. Any array access depends on the most + * recent write. This way, nothing depends on writes after the + * last read. But all the writes that happen before that have + * something depending on them */ - struct ir3_instruction *last_write, *last_access; + struct ir3_instruction *last_write; /* extra stuff used in RA pass: */ unsigned base; /* base vreg name */ @@ -493,6 +524,7 @@ struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc); struct ir3_instruction * ir3_instr_create2(struct ir3_block *block, opc_t opc, int nreg); struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep); const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, @@ -907,25 +939,36 @@ static inline unsigned ir3_cat3_absneg(opc_t opc) static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) { + unsigned cnt = instr->regs_count + instr->deps_count; if (instr->address) - return instr->regs_count + 1; - return instr->regs_count; + cnt++; + return cnt; } static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) { - if (n == (instr->regs_count + 0)) + if (n == (instr->regs_count + instr->deps_count)) return instr->address; + if (n >= instr->regs_count) + return instr->deps[n - instr->regs_count]; return ssa(instr->regs[n]); } +static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n) +{ + if (n == (instr->regs_count + instr->deps_count)) + return false; + if (n >= instr->regs_count) + return true; + return false; +} + #define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1) /* iterator for an instruction's SSA sources (instr), also returns src #: */ #define foreach_ssa_src_n(__srcinst, __n, __instr) \ - if ((__instr)->regs_count) \ - for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ - if ((__srcinst = __ssa_src_n(__instr, __n))) + for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \ + if ((__srcinst = __ssa_src_n(__instr, __n))) /* iterator for an instruction's SSA sources (instr): */ #define foreach_ssa_src(__srcinst, __instr) \ @@ -950,6 +993,7 @@ void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); void ir3_group(struct ir3 *ir); /* scheduling: */ +void ir3_sched_add_deps(struct ir3 *ir); int ir3_sched(struct ir3 *ir); /* register assignment: */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index bd3e0d0cd4a..3fd2e50d82f 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -74,20 +74,6 @@ struct ir3_context { /* Compute shader inputs: */ struct ir3_instruction *local_invocation_id, *work_group_id; - /* For SSBO's and atomics, we need to preserve order, such - * that reads don't overtake writes, and the order of writes - * is preserved. Atomics are considered as a write. - * - * To do this, we track last write and last access, in a - * similar way to ir3_array. But since we don't know whether - * the same SSBO is bound to multiple slots, so we simply - * track this globally rather than per-SSBO. 
- * - * TODO should we track this per block instead? I guess it - * shouldn't matter much? - */ - struct ir3_instruction *last_write, *last_access; - /* mapping from nir_register to defining instruction: */ struct hash_table *def_ht; @@ -345,6 +331,8 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_R; + mov->barrier_conflict = IR3_BARRIER_ARRAY_W; ir3_reg_create(mov, 0, 0); src = ir3_reg_create(mov, 0, IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV)); @@ -356,8 +344,6 @@ create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, if (address) ir3_instr_set_address(mov, address); - arr->last_access = mov; - return mov; } @@ -373,9 +359,11 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, mov = ir3_instr_create(block, OPC_MOV); mov->cat1.src_type = TYPE_U32; mov->cat1.dst_type = TYPE_U32; + mov->barrier_class = IR3_BARRIER_ARRAY_W; + mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV)); - dst->instr = arr->last_access; + dst->instr = arr->last_write; dst->size = arr->length; dst->array.id = arr->id; dst->array.offset = n; @@ -384,7 +372,7 @@ create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, if (address) ir3_instr_set_address(mov, address); - arr->last_write = arr->last_access = mov; + arr->last_write = mov; return mov; } @@ -1236,22 +1224,6 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, } } -static void -mark_read(struct ir3_context *ctx, struct ir3_instruction *instr) -{ - instr->regs[0]->instr = ctx->last_write; - instr->regs[0]->flags |= IR3_REG_SSA; - ctx->last_access = instr; -} - -static void -mark_write(struct ir3_context *ctx, struct ir3_instruction *instr) -{ - instr->regs[0]->instr = ctx->last_access; - instr->regs[0]->flags |= IR3_REG_SSA; - ctx->last_write = ctx->last_access = instr; -} - /* src[] = { buffer_index, offset }. 
No const_index */ static void emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -1280,7 +1252,8 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, ldgb->cat6.iim_val = intr->num_components; ldgb->cat6.d = 4; ldgb->cat6.type = TYPE_U32; - mark_read(ctx, ldgb); + ldgb->barrier_class = IR3_BARRIER_BUFFER_R; + ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W; split_dest(b, dst, ldgb, 0, intr->num_components); } @@ -1320,7 +1293,8 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) stgb->cat6.iim_val = ncomp; stgb->cat6.d = 4; stgb->cat6.type = TYPE_U32; - mark_write(ctx, stgb); + stgb->barrier_class = IR3_BARRIER_BUFFER_W; + stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; array_insert(b, b->keeps, stgb); } @@ -1430,7 +1404,8 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) atomic->cat6.iim_val = 1; atomic->cat6.d = 4; atomic->cat6.type = type; - mark_write(ctx, atomic); + atomic->barrier_class = IR3_BARRIER_BUFFER_W; + atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); @@ -1455,7 +1430,8 @@ emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr, ldl->cat6.type = TYPE_U32; ldl->regs[0]->wrmask = MASK(intr->num_components); - mark_read(ctx, ldl); + ldl->barrier_class = IR3_BARRIER_SHARED_R; + ldl->barrier_conflict = IR3_BARRIER_SHARED_W; split_dest(b, dst, ldl, 0, intr->num_components); } @@ -1491,8 +1467,9 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) create_immed(b, length), 0); stl->cat6.dst_offset = first_component + base; stl->cat6.type = TYPE_U32; + stl->barrier_class = IR3_BARRIER_SHARED_W; + stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; - mark_write(ctx, stl); array_insert(b, b->keeps, stl); /* Clear the bits in the writemask that we just wrote, then try @@ -1573,7 +1550,8 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) atomic->cat6.iim_val = 1; atomic->cat6.d = 1; atomic->cat6.type = type; - mark_write(ctx, atomic); + atomic->barrier_class = IR3_BARRIER_SHARED_W; + atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W; /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); @@ -1702,6 +1680,9 @@ emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags, tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL); + sam->barrier_class = IR3_BARRIER_IMAGE_R; + sam->barrier_conflict = IR3_BARRIER_IMAGE_W; + split_dest(b, dst, sam, 0, 4); } @@ -1737,7 +1718,8 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) stib->cat6.d = ncoords; stib->cat6.type = get_image_type(var); stib->cat6.typed = true; - mark_write(ctx, stib); + stib->barrier_class = IR3_BARRIER_IMAGE_W; + stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; array_insert(b, b->keeps, stib); } @@ -1821,7 +1803,8 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) atomic->cat6.d = ncoords; atomic->cat6.type = get_image_type(var); atomic->cat6.typed = true; - mark_write(ctx, atomic); + atomic->barrier_class = IR3_BARRIER_IMAGE_W; + atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; /* even if nothing consume the 
result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); @@ -1841,23 +1824,62 @@ emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr) barrier->cat7.g = true; barrier->cat7.l = true; barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY; + barrier->barrier_class = IR3_BARRIER_EVERYTHING; break; case nir_intrinsic_memory_barrier: + barrier = ir3_FENCE(b); + barrier->cat7.g = true; + barrier->cat7.r = true; + barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_W; + barrier->barrier_conflict = + IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; + break; case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: barrier = ir3_FENCE(b); barrier->cat7.g = true; barrier->cat7.r = true; barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_BUFFER_W; + barrier->barrier_conflict = IR3_BARRIER_BUFFER_R | + IR3_BARRIER_BUFFER_W; break; - case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier_image: + // TODO double check if this should have .g set + barrier = ir3_FENCE(b); + barrier->cat7.g = true; + barrier->cat7.r = true; + barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_IMAGE_W; + barrier->barrier_conflict = IR3_BARRIER_IMAGE_R | + IR3_BARRIER_IMAGE_W; + break; case nir_intrinsic_memory_barrier_shared: barrier = ir3_FENCE(b); barrier->cat7.g = true; barrier->cat7.l = true; barrier->cat7.r = true; barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_SHARED_W; + barrier->barrier_conflict = IR3_BARRIER_SHARED_R | + IR3_BARRIER_SHARED_W; + break; + case nir_intrinsic_group_memory_barrier: + barrier = ir3_FENCE(b); + barrier->cat7.g = true; + barrier->cat7.l = true; + barrier->cat7.r = true; + barrier->cat7.w = true; + barrier->barrier_class = IR3_BARRIER_SHARED_W | + IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_W; + barrier->barrier_conflict = + IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W | + IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W | + IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; break; default: unreachable("boo"); @@ -3301,6 +3323,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_print(ir); } + ir3_sched_add_deps(ir); + /* Group left/right neighbors, inserting mov's where needed to * solve conflicts: */ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c index be39027b6a0..55ca5333b47 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c @@ -55,6 +55,10 @@ int ir3_delayslots(struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned n) { + /* don't count false-dependencies: */ + if (__is_false_dep(consumer, n)) + return 0; + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch * handled with sync bits diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c index b56da304f92..9492e9ba650 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -669,3 +669,94 @@ int ir3_sched(struct ir3 *ir) return -1; return 0; } + +/* does instruction 'prior' need to be scheduled before 'instr'? 
*/ +static bool +depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) +{ + /* TODO for dependencies that are related to a specific object, ie + * a specific SSBO/image/array, we could relax this constraint to + * make accesses to unrelated objects not depend on each other (at + * least as long as not declared coherent) + */ + if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) || + (prior->barrier_class & IR3_BARRIER_EVERYTHING)) + return true; + return !!(instr->barrier_class & prior->barrier_conflict); +} + +static void +add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) +{ + struct list_head *prev = instr->node.prev; + struct list_head *next = instr->node.next; + + /* add dependencies on previous instructions that must be scheduled + * prior to the current instruction + */ + while (prev != &block->instr_list) { + struct ir3_instruction *pi = + LIST_ENTRY(struct ir3_instruction, prev, node); + + prev = prev->prev; + + if (is_meta(pi)) + continue; + + if (instr->barrier_class == pi->barrier_class) { + ir3_instr_add_dep(instr, pi); + break; + } + + if (depends_on(instr, pi)) + ir3_instr_add_dep(instr, pi); + } + + /* add dependencies on this instruction to following instructions + * that must be scheduled after the current instruction: + */ + while (next != &block->instr_list) { + struct ir3_instruction *ni = + LIST_ENTRY(struct ir3_instruction, next, node); + + next = next->next; + + if (is_meta(ni)) + continue; + + if (instr->barrier_class == ni->barrier_class) { + ir3_instr_add_dep(ni, instr); + break; + } + + if (depends_on(ni, instr)) + ir3_instr_add_dep(ni, instr); + } +} + +/* before scheduling a block, we need to add any necessary false-dependencies + * to ensure that: + * + * (1) barriers are scheduled in the right order wrt instructions related + * to the barrier + * + * (2) reads that come before a write actually get scheduled before the + * write + */ +static void +calculate_deps(struct ir3_block *block) +{ + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (instr->barrier_class) { + add_barrier_deps(block, instr); + } + } +} + +void +ir3_sched_add_deps(struct ir3 *ir) +{ + list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { + calculate_deps(block); + } +}
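
As an illustration of the class/conflict scheme introduced above, here is a minimal, self-contained sketch of the check that depends_on() performs. The struct and helper names (fake_instr, must_stay_ordered) are stand-ins invented for the example rather than real ir3 types, and the enum values simply mirror the bits added to ir3.h; it is a sketch of the idea, not part of the patch. Under those assumptions it shows that a buffer write is forced to stay ordered after an earlier buffer read (the WAR case the commit message describes), while accesses to an unrelated resource class remain free to reorder.

#include <stdbool.h>
#include <stdio.h>

/* Standalone copy of the barrier bits, matching the enum added to ir3.h: */
enum barrier_bits {
	IR3_BARRIER_EVERYTHING = 1 << 0,
	IR3_BARRIER_SHARED_R   = 1 << 1,
	IR3_BARRIER_SHARED_W   = 1 << 2,
	IR3_BARRIER_IMAGE_R    = 1 << 3,
	IR3_BARRIER_IMAGE_W    = 1 << 4,
	IR3_BARRIER_BUFFER_R   = 1 << 5,
	IR3_BARRIER_BUFFER_W   = 1 << 6,
	IR3_BARRIER_ARRAY_R    = 1 << 7,
	IR3_BARRIER_ARRAY_W    = 1 << 8,
};

/* Simplified stand-in for ir3_instruction, carrying only the two fields
 * the dependency check looks at:
 */
struct fake_instr {
	const char *name;
	unsigned barrier_class;    /* what kind of access this instruction is */
	unsigned barrier_conflict; /* which later accesses may not pass it */
};

/* Same rule as depends_on() in ir3_sched.c: 'prior' must stay scheduled
 * before 'instr' if either is a full barrier, or if instr's class is in
 * prior's conflict mask:
 */
static bool
must_stay_ordered(const struct fake_instr *instr, const struct fake_instr *prior)
{
	if ((instr->barrier_class & IR3_BARRIER_EVERYTHING) ||
	    (prior->barrier_class & IR3_BARRIER_EVERYTHING))
		return true;
	return !!(instr->barrier_class & prior->barrier_conflict);
}

int main(void)
{
	/* An SSBO load, an SSBO store and an image store, classified the same
	 * way emit_intrinsic_load_ssbo/store_ssbo/store_image do in the patch:
	 */
	struct fake_instr ssbo_load  = { "ldgb", IR3_BARRIER_BUFFER_R, IR3_BARRIER_BUFFER_W };
	struct fake_instr ssbo_store = { "stgb", IR3_BARRIER_BUFFER_W,
	                                 IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W };
	struct fake_instr img_store  = { "stib", IR3_BARRIER_IMAGE_W,
	                                 IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W };

	/* A buffer write may not overtake an earlier buffer read (WAR hazard): */
	printf("stgb after ldgb ordered: %d\n", must_stay_ordered(&ssbo_store, &ssbo_load)); /* 1 */
	/* ...but an image write and a buffer read don't constrain each other: */
	printf("stib after ldgb ordered: %d\n", must_stay_ordered(&img_store, &ssbo_load));  /* 0 */
	return 0;
}

The asymmetry between the two fields is what makes reads cheap: a read's conflict mask carries only the corresponding _W bit, so later reads of the same class can still pass it, while a write's conflict mask carries both the _R and _W bits, so no later access of that class can pass the write. add_barrier_deps() then walks the block in both directions, so the same masks also pin the instructions that must come after a barrier, not just before it.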