X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fwinsys%2Famdgpu%2Fdrm%2Famdgpu_cs.c;h=f51c7782033d8006082521d2c6e4adc30093887d;hb=73128dd46e12fb778e8a5fce3ee873d1ffa3f4d3;hp=555150a7018a2e2166c49375766d3503dac46ce9;hpb=187f1c999f90c3bef5b657bf386f076436149c1c;p=mesa.git diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 555150a7018..f51c7782033 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -35,14 +35,6 @@ DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false) -#ifndef AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID -#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) -#endif - -#ifndef AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES -#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 -#endif - /* FENCES */ static struct pipe_fence_handle * @@ -199,26 +191,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, if (afence->signalled) return true; + if (absolute) + abs_timeout = timeout; + else + abs_timeout = os_time_get_absolute_timeout(timeout); + /* Handle syncobjs. */ if (amdgpu_fence_is_syncobj(afence)) { - /* Absolute timeouts are only be used by BO fences, which aren't - * backed by syncobjs. - */ - assert(!absolute); - if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1, - timeout, 0, NULL)) + abs_timeout, 0, NULL)) return false; afence->signalled = true; return true; } - if (absolute) - abs_timeout = timeout; - else - abs_timeout = os_time_get_absolute_timeout(timeout); - /* The fence might not have a number assigned if its IB is being * submitted in the other thread right now. Wait until the submission * is done. */ @@ -349,33 +336,49 @@ static enum pipe_reset_status amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; - uint32_t result, hangs; int r; + /* Return a failure due to a GPU hang. */ + if (ctx->ws->info.drm_minor >= 24) { + uint64_t flags; + + r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r); + return PIPE_NO_RESET; + } + + if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) { + if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY) + return PIPE_GUILTY_CONTEXT_RESET; + else + return PIPE_INNOCENT_CONTEXT_RESET; + } + } else { + uint32_t result, hangs; + + r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r); + return PIPE_NO_RESET; + } + + switch (result) { + case AMDGPU_CTX_GUILTY_RESET: + return PIPE_GUILTY_CONTEXT_RESET; + case AMDGPU_CTX_INNOCENT_RESET: + return PIPE_INNOCENT_CONTEXT_RESET; + case AMDGPU_CTX_UNKNOWN_RESET: + return PIPE_UNKNOWN_CONTEXT_RESET; + } + } + /* Return a failure due to a rejected command submission. */ if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) { return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET : PIPE_INNOCENT_CONTEXT_RESET; } - - /* Return a failure due to a GPU hang. */ - r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. 
(%i)\n", r); - return PIPE_NO_RESET; - } - - switch (result) { - case AMDGPU_CTX_GUILTY_RESET: - return PIPE_GUILTY_CONTEXT_RESET; - case AMDGPU_CTX_INNOCENT_RESET: - return PIPE_INNOCENT_CONTEXT_RESET; - case AMDGPU_CTX_UNKNOWN_RESET: - return PIPE_UNKNOWN_CONTEXT_RESET; - case AMDGPU_CTX_NO_RESET: - default: - return PIPE_NO_RESET; - } + return PIPE_NO_RESET; } /* COMMAND SUBMISSION */ @@ -673,7 +676,8 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs, return index; } -static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib, +static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, + struct amdgpu_ib *ib, enum ring_type ring_type) { struct pb_buffer *pb; @@ -697,18 +701,18 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib, buffer_size = MIN2(buffer_size, max_size); buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */ - pb = ws->base.buffer_create(&ws->base, buffer_size, - ws->info.gart_page_size, - RADEON_DOMAIN_GTT, - RADEON_FLAG_NO_INTERPROCESS_SHARING | - (ring_type == RING_GFX || - ring_type == RING_COMPUTE || - ring_type == RING_DMA ? - RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC : 0)); + pb = amdgpu_bo_create(ws, buffer_size, + ws->info.gart_page_size, + RADEON_DOMAIN_GTT, + RADEON_FLAG_NO_INTERPROCESS_SHARING | + (ring_type == RING_GFX || + ring_type == RING_COMPUTE || + ring_type == RING_DMA ? + RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC : 0)); if (!pb) return false; - mapped = ws->base.buffer_map(pb, NULL, PIPE_TRANSFER_WRITE); + mapped = amdgpu_bo_map(pb, NULL, PIPE_TRANSFER_WRITE); if (!mapped) { pb_reference(&pb, NULL); return false; @@ -741,10 +745,9 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) } } -static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, +static bool amdgpu_get_new_ib(struct amdgpu_winsys *ws, struct amdgpu_cs *cs, enum ib_type ib_type) { - struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws; /* Small IBs are better than big IBs, because the GPU goes idle quicker * and there is less waiting for buffers and fences. Proof: * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 @@ -786,7 +789,7 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, /* Allocate a new buffer for IBs if the current buffer is all used. */ if (!ib->big_ib_buffer || ib->used_ib_space + ib_size > ib->big_ib_buffer->size) { - if (!amdgpu_ib_new_buffer(aws, ib, cs->ring_type)) + if (!amdgpu_ib_new_buffer(ws, ib, cs->ring_type)) return false; } @@ -823,7 +826,7 @@ static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) { amdgpu_set_ib_size(ib); ib->used_ib_space += ib->base.current.cdw * 4; - ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_start_alignment); + ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_alignment); ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw); } @@ -988,7 +991,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, cs->csc = &cs->csc1; cs->cst = &cs->csc2; - if (!amdgpu_get_new_ib(&ctx->ws->base, cs, IB_MAIN)) { + if (!amdgpu_get_new_ib(ctx->ws, cs, IB_MAIN)) { amdgpu_destroy_cs_context(&cs->csc2); amdgpu_destroy_cs_context(&cs->csc1); FREE(cs); @@ -1014,7 +1017,7 @@ amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib, return NULL; /* Allocate the compute IB. 
*/ - if (!amdgpu_get_new_ib(&ws->base, cs, IB_PARALLEL_COMPUTE)) + if (!amdgpu_get_new_ib(ws, cs, IB_PARALLEL_COMPUTE)) return NULL; if (uses_gds_ordered_append) { @@ -1026,6 +1029,60 @@ amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib, return &cs->compute_ib.base; } +static bool +amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib, + unsigned preamble_num_dw) +{ + struct amdgpu_ib *ib = amdgpu_ib(rcs); + struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib); + struct amdgpu_winsys *ws = cs->ctx->ws; + struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2}; + unsigned size = align(preamble_num_dw * 4, ws->info.ib_alignment); + struct pb_buffer *preamble_bo; + uint32_t *map; + + /* Create the preamble IB buffer. */ + preamble_bo = amdgpu_bo_create(ws, size, ws->info.ib_alignment, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_GTT_WC | + RADEON_FLAG_READ_ONLY); + if (!preamble_bo) + return false; + + map = (uint32_t*)amdgpu_bo_map(preamble_bo, NULL, + PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); + if (!map) { + pb_reference(&preamble_bo, NULL); + return false; + } + + /* Upload the preamble IB. */ + memcpy(map, preamble_ib, preamble_num_dw * 4); + + /* Pad the IB. */ + uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type]; + while (preamble_num_dw & ib_pad_dw_mask) + map[preamble_num_dw++] = PKT3_NOP_PAD; + amdgpu_bo_unmap(preamble_bo); + + for (unsigned i = 0; i < 2; i++) { + csc[i]->ib[IB_PREAMBLE] = csc[i]->ib[IB_MAIN]; + csc[i]->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE; + csc[i]->ib[IB_PREAMBLE].va_start = amdgpu_winsys_bo(preamble_bo)->va; + csc[i]->ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4; + + csc[i]->ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT; + } + + assert(!cs->preamble_ib_bo); + cs->preamble_ib_bo = preamble_bo; + + amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0, + RADEON_PRIO_IB1); + return true; +} + static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs) { return true; @@ -1089,14 +1146,16 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw, /* This space was originally reserved. */ rcs->current.max_dw += cs_epilog_dw; - /* Pad with NOPs and add INDIRECT_BUFFER packet */ - while ((rcs->current.cdw & 7) != 4) - radeon_emit(rcs, 0xffff1000); /* type3 nop packet */ + /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */ + uint32_t ib_pad_dw_mask = cs->ctx->ws->info.ib_pad_dw_mask[cs->ring_type]; + while ((rcs->current.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3) + radeon_emit(rcs, PKT3_NOP_PAD); radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0)); radeon_emit(rcs, va); radeon_emit(rcs, va >> 32); new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++]; + assert((rcs->current.cdw & ib_pad_dw_mask) == 0); assert((rcs->current.cdw & 7) == 0); assert(rcs->current.cdw <= rcs->current.max_dw); @@ -1160,17 +1219,20 @@ static void add_fence_to_list(struct amdgpu_fence_list *fences, amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence); } -/* TODO: recognizing dependencies as no-ops doesn't take the parallel - * compute IB into account. The compute IB won't wait for these. - * Also, the scheduler can execute compute and SDMA IBs on any rings. - * Should we always insert dependencies? 
- */ static bool is_noop_fence_dependency(struct amdgpu_cs *acs, struct amdgpu_fence *fence) { struct amdgpu_cs_context *cs = acs->csc; - if (!amdgpu_fence_is_syncobj(fence) && + /* Detect no-op dependencies only when there is only 1 ring, + * because IBs on one ring are always executed one at a time. + * + * We always want no dependency between back-to-back gfx IBs, because + * we need the parallelism between IBs for good performance. + */ + if ((acs->ring_type == RING_GFX || + acs->ctx->ws->info.num_rings[acs->ring_type] == 1) && + !amdgpu_fence_is_syncobj(fence) && fence->ctx == acs->ctx && fence->fence.ip_type == cs->ib[IB_MAIN].ip_type && fence->fence.ip_instance == cs->ib[IB_MAIN].ip_instance && @@ -1379,9 +1441,6 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) simple_mtx_lock(&ws->global_bo_list_lock); LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) { - if (bo->is_local) - continue; - list[num_handles].bo_handle = bo->u.real.kms_handle; list[num_handles].bo_priority = 0; ++num_handles; @@ -1406,10 +1465,6 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) unsigned num_handles = 0; for (i = 0; i < cs->num_real_buffers; ++i) { struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i]; - - if (buffer->bo->is_local) - continue; - assert(buffer->u.real.priority_usage != 0); list[num_handles].bo_handle = buffer->bo->u.real.kms_handle; @@ -1440,7 +1495,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) { r = -ECANCELED; } else { - struct drm_amdgpu_cs_chunk chunks[6]; + struct drm_amdgpu_cs_chunk chunks[7]; unsigned num_chunks = 0; /* BO list */ @@ -1583,6 +1638,14 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) num_chunks++; } + /* IB */ + if (cs->ib[IB_PREAMBLE].ib_bytes) { + chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; + chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; + chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE]; + num_chunks++; + } + /* IB */ cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; @@ -1590,6 +1653,14 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN]; num_chunks++; + if (ws->secure && cs->secure) { + cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE; + cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE; + } else { + cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE; + cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE; + } + assert(num_chunks <= ARRAY_SIZE(chunks)); r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list, @@ -1655,54 +1726,54 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, struct amdgpu_cs *cs = amdgpu_cs(rcs); struct amdgpu_winsys *ws = cs->ctx->ws; int error_code = 0; + uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type]; rcs->current.max_dw += amdgpu_cs_epilog_dws(cs); + /* Pad the IB according to the mask. 
*/ switch (cs->ring_type) { case RING_DMA: - /* pad DMA ring to 8 DWs */ if (ws->info.chip_class <= GFX6) { - while (rcs->current.cdw & 7) + while (rcs->current.cdw & ib_pad_dw_mask) radeon_emit(rcs, 0xf0000000); /* NOP packet */ } else { - while (rcs->current.cdw & 7) + while (rcs->current.cdw & ib_pad_dw_mask) radeon_emit(rcs, 0x00000000); /* NOP packet */ } break; case RING_GFX: case RING_COMPUTE: - /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */ if (ws->info.gfx_ib_pad_with_type2) { - while (rcs->current.cdw & 7) - radeon_emit(rcs, 0x80000000); /* type2 nop packet */ + while (rcs->current.cdw & ib_pad_dw_mask) + radeon_emit(rcs, PKT2_NOP_PAD); } else { - while (rcs->current.cdw & 7) - radeon_emit(rcs, 0xffff1000); /* type3 nop packet */ + while (rcs->current.cdw & ib_pad_dw_mask) + radeon_emit(rcs, PKT3_NOP_PAD); } if (cs->ring_type == RING_GFX) ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4; /* Also pad secondary IBs. */ if (cs->compute_ib.ib_mapped) { - while (cs->compute_ib.base.current.cdw & 7) - radeon_emit(&cs->compute_ib.base, 0xffff1000); /* type3 nop packet */ + while (cs->compute_ib.base.current.cdw & ib_pad_dw_mask) + radeon_emit(&cs->compute_ib.base, PKT3_NOP_PAD); } break; case RING_UVD: case RING_UVD_ENC: - while (rcs->current.cdw & 15) + while (rcs->current.cdw & ib_pad_dw_mask) radeon_emit(rcs, 0x80000000); /* type2 nop packet */ break; case RING_VCN_JPEG: if (rcs->current.cdw % 2) assert(0); - while (rcs->current.cdw & 15) { + while (rcs->current.cdw & ib_pad_dw_mask) { radeon_emit(rcs, 0x60000000); /* nop packet */ radeon_emit(rcs, 0x00000000); } break; case RING_VCN_DEC: - while (rcs->current.cdw & 15) + while (rcs->current.cdw & ib_pad_dw_mask) radeon_emit(rcs, 0x81ff); /* nop packet */ break; default: @@ -1757,7 +1828,7 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, /* Submit. */ util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, - amdgpu_cs_submit_ib, NULL); + amdgpu_cs_submit_ib, NULL, 0); /* The submission has been queued, unlock the fence now. 
*/ simple_mtx_unlock(&ws->bo_fence_lock); @@ -1769,9 +1840,14 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, amdgpu_cs_context_cleanup(cs->csc); } - amdgpu_get_new_ib(&ws->base, cs, IB_MAIN); + amdgpu_get_new_ib(ws, cs, IB_MAIN); if (cs->compute_ib.ib_mapped) - amdgpu_get_new_ib(&ws->base, cs, IB_PARALLEL_COMPUTE); + amdgpu_get_new_ib(ws, cs, IB_PARALLEL_COMPUTE); + + if (cs->preamble_ib_bo) { + amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0, + RADEON_PRIO_IB1); + } cs->main.base.used_gart = 0; cs->main.base.used_vram = 0; @@ -1791,6 +1867,7 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) amdgpu_cs_sync_flush(rcs); util_queue_fence_destroy(&cs->flush_completed); p_atomic_dec(&cs->ctx->ws->num_cs); + pb_reference(&cs->preamble_ib_bo, NULL); pb_reference(&cs->main.big_ib_buffer, NULL); FREE(cs->main.base.prev); pb_reference(&cs->compute_ib.big_ib_buffer, NULL); @@ -1811,13 +1888,14 @@ static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs, return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage); } -void amdgpu_cs_init_functions(struct amdgpu_winsys *ws) +void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws) { ws->base.ctx_create = amdgpu_ctx_create; ws->base.ctx_destroy = amdgpu_ctx_destroy; ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib; + ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption; ws->base.cs_destroy = amdgpu_cs_destroy; ws->base.cs_add_buffer = amdgpu_cs_add_buffer; ws->base.cs_validate = amdgpu_cs_validate;
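
Note on the new padding scheme (not part of the patch itself): the chaining path in amdgpu_cs_check_space now pads with PKT3 NOPs until exactly four dwords remain before the next alignment boundary, so that the INDIRECT_BUFFER packet (header, VA low, VA high, size dword) lands the chained IB on a multiple of ib_pad_dw_mask + 1. Below is a minimal standalone sketch of that arithmetic only; the mask value 7 is an assumption for a GFX-style ring (the real value comes from ws->info.ib_pad_dw_mask[ring_type]), and PKT3_NOP_PAD is spelled as the 0xffff1000 type-3 NOP the pre-patch code emitted inline.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PKT3_NOP_PAD 0xffff1000u  /* type-3 NOP; value the old code hard-coded */

int main(void)
{
   uint32_t ib[64];
   unsigned cdw = 9;                   /* pretend 9 dwords of commands were already written */
   const uint32_t ib_pad_dw_mask = 7;  /* assumed GFX value: pad to 8-dword multiples */

   /* Pad with NOPs but leave exactly 4 dwords for the INDIRECT_BUFFER packet
    * (the first 9 entries stand in for real packets and are left untouched here). */
   while ((cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
      ib[cdw++] = PKT3_NOP_PAD;

   cdw += 4;  /* INDIRECT_BUFFER header + VA low + VA high + size dword */
   assert((cdw & ib_pad_dw_mask) == 0);  /* same invariant the patch asserts */

   printf("chained IB ends at %u dwords\n", cdw);
   return 0;
}
```

The flush path in amdgpu_cs_flush uses the same mask without reserving the four dwords, since no INDIRECT_BUFFER follows the last IB of a submission.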