diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 4308cfb3f96..f51c7782033 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -35,14 +35,6 @@
 
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
-#ifndef AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
-#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
-#endif
-
-#ifndef AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
-#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07
-#endif
-
 /* FENCES */
 
 static struct pipe_fence_handle *
@@ -199,26 +191,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
    if (afence->signalled)
       return true;
 
+   if (absolute)
+      abs_timeout = timeout;
+   else
+      abs_timeout = os_time_get_absolute_timeout(timeout);
+
    /* Handle syncobjs. */
    if (amdgpu_fence_is_syncobj(afence)) {
-      /* Absolute timeouts are only be used by BO fences, which aren't
-       * backed by syncobjs.
-       */
-      assert(!absolute);
-
       if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1,
-                                 timeout, 0, NULL))
+                                 abs_timeout, 0, NULL))
          return false;
 
       afence->signalled = true;
       return true;
    }
 
-   if (absolute)
-      abs_timeout = timeout;
-   else
-      abs_timeout = os_time_get_absolute_timeout(timeout);
-
    /* The fence might not have a number assigned if its IB is being
     * submitted in the other thread right now. Wait until the submission
     * is done. */
@@ -839,7 +826,7 @@ static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
 {
    amdgpu_set_ib_size(ib);
    ib->used_ib_space += ib->base.current.cdw * 4;
-   ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_start_alignment);
+   ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_alignment);
    ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw);
 }
 
@@ -1042,6 +1029,60 @@ amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib,
    return &cs->compute_ib.base;
 }
 
+static bool
+amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
+                           unsigned preamble_num_dw)
+{
+   struct amdgpu_ib *ib = amdgpu_ib(rcs);
+   struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
+   struct amdgpu_winsys *ws = cs->ctx->ws;
+   struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
+   unsigned size = align(preamble_num_dw * 4, ws->info.ib_alignment);
+   struct pb_buffer *preamble_bo;
+   uint32_t *map;
+
+   /* Create the preamble IB buffer. */
+   preamble_bo = amdgpu_bo_create(ws, size, ws->info.ib_alignment,
+                                  RADEON_DOMAIN_VRAM,
+                                  RADEON_FLAG_NO_INTERPROCESS_SHARING |
+                                  RADEON_FLAG_GTT_WC |
+                                  RADEON_FLAG_READ_ONLY);
+   if (!preamble_bo)
+      return false;
+
+   map = (uint32_t*)amdgpu_bo_map(preamble_bo, NULL,
+                                  PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
+   if (!map) {
+      pb_reference(&preamble_bo, NULL);
+      return false;
+   }
+
+   /* Upload the preamble IB. */
+   memcpy(map, preamble_ib, preamble_num_dw * 4);
+
+   /* Pad the IB. */
+   uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type];
+   while (preamble_num_dw & ib_pad_dw_mask)
+      map[preamble_num_dw++] = PKT3_NOP_PAD;
+   amdgpu_bo_unmap(preamble_bo);
+
+   for (unsigned i = 0; i < 2; i++) {
+      csc[i]->ib[IB_PREAMBLE] = csc[i]->ib[IB_MAIN];
+      csc[i]->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
+      csc[i]->ib[IB_PREAMBLE].va_start = amdgpu_winsys_bo(preamble_bo)->va;
+      csc[i]->ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
+
+      csc[i]->ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
+   }
+
+   assert(!cs->preamble_ib_bo);
+   cs->preamble_ib_bo = preamble_bo;
+
+   amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
+                        RADEON_PRIO_IB1);
+   return true;
+}
+
 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
 {
    return true;
@@ -1105,14 +1146,16 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
    /* This space was originally reserved. */
    rcs->current.max_dw += cs_epilog_dw;
 
-   /* Pad with NOPs and add INDIRECT_BUFFER packet */
-   while ((rcs->current.cdw & 7) != 4)
-      radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
+   /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
+   uint32_t ib_pad_dw_mask = cs->ctx->ws->info.ib_pad_dw_mask[cs->ring_type];
+   while ((rcs->current.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
+      radeon_emit(rcs, PKT3_NOP_PAD);
 
    radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
    radeon_emit(rcs, va);
    radeon_emit(rcs, va >> 32);
    new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
+   assert((rcs->current.cdw & ib_pad_dw_mask) == 0);
 
    assert((rcs->current.cdw & 7) == 0);
    assert(rcs->current.cdw <= rcs->current.max_dw);
@@ -1452,7 +1495,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
    if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) {
       r = -ECANCELED;
    } else {
-      struct drm_amdgpu_cs_chunk chunks[6];
+      struct drm_amdgpu_cs_chunk chunks[7];
       unsigned num_chunks = 0;
 
       /* BO list */
@@ -1595,6 +1638,14 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
         num_chunks++;
      }
 
+      /* IB */
+      if (cs->ib[IB_PREAMBLE].ib_bytes) {
+         chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
+         chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
+         chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE];
+         num_chunks++;
+      }
+
       /* IB */
       cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
@@ -1602,6 +1653,14 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
       chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
       num_chunks++;
 
+      if (ws->secure && cs->secure) {
+         cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
+         cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
+      } else {
+         cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
+         cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
+      }
+
       assert(num_chunks <= ARRAY_SIZE(chunks));
 
       r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
@@ -1667,51 +1726,54 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct amdgpu_winsys *ws = cs->ctx->ws;
    int error_code = 0;
+   uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type];
 
    rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
 
+   /* Pad the IB according to the mask. */
    switch (cs->ring_type) {
    case RING_DMA:
-      /* pad DMA ring to 8 DWs */
       if (ws->info.chip_class <= GFX6) {
-         while (rcs->current.cdw & 7)
+         while (rcs->current.cdw & ib_pad_dw_mask)
             radeon_emit(rcs, 0xf0000000); /* NOP packet */
+      } else {
+         while (rcs->current.cdw & ib_pad_dw_mask)
+            radeon_emit(rcs, 0x00000000); /* NOP packet */
       }
       break;
    case RING_GFX:
    case RING_COMPUTE:
-      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
       if (ws->info.gfx_ib_pad_with_type2) {
-         while (rcs->current.cdw & 7)
-            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
+         while (rcs->current.cdw & ib_pad_dw_mask)
+            radeon_emit(rcs, PKT2_NOP_PAD);
       } else {
-         while (rcs->current.cdw & 7)
-            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
+         while (rcs->current.cdw & ib_pad_dw_mask)
+            radeon_emit(rcs, PKT3_NOP_PAD);
       }
       if (cs->ring_type == RING_GFX)
          ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
 
       /* Also pad secondary IBs. */
       if (cs->compute_ib.ib_mapped) {
-         while (cs->compute_ib.base.current.cdw & 7)
-            radeon_emit(&cs->compute_ib.base, 0xffff1000); /* type3 nop packet */
+         while (cs->compute_ib.base.current.cdw & ib_pad_dw_mask)
+            radeon_emit(&cs->compute_ib.base, PKT3_NOP_PAD);
       }
       break;
    case RING_UVD:
    case RING_UVD_ENC:
-      while (rcs->current.cdw & 15)
+      while (rcs->current.cdw & ib_pad_dw_mask)
          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
       break;
    case RING_VCN_JPEG:
       if (rcs->current.cdw % 2)
         assert(0);
-      while (rcs->current.cdw & 15) {
+      while (rcs->current.cdw & ib_pad_dw_mask) {
         radeon_emit(rcs, 0x60000000); /* nop packet */
         radeon_emit(rcs, 0x00000000);
      }
      break;
    case RING_VCN_DEC:
-      while (rcs->current.cdw & 15)
+      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x81ff); /* nop packet */
      break;
    default:
@@ -1782,6 +1844,11 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
       if (cs->compute_ib.ib_mapped)
          amdgpu_get_new_ib(ws, cs, IB_PARALLEL_COMPUTE);
 
+      if (cs->preamble_ib_bo) {
+         amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
+                              RADEON_PRIO_IB1);
+      }
+
       cs->main.base.used_gart = 0;
       cs->main.base.used_vram = 0;
 
@@ -1800,6 +1867,7 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
    amdgpu_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    p_atomic_dec(&cs->ctx->ws->num_cs);
+   pb_reference(&cs->preamble_ib_bo, NULL);
    pb_reference(&cs->main.big_ib_buffer, NULL);
    FREE(cs->main.base.prev);
    pb_reference(&cs->compute_ib.big_ib_buffer, NULL);
@@ -1827,6 +1895,7 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
    ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
    ws->base.cs_create = amdgpu_cs_create;
    ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib;
+   ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
    ws->base.cs_destroy = amdgpu_cs_destroy;
    ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
    ws->base.cs_validate = amdgpu_cs_validate;
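
As an illustration of the mask-based padding scheme this patch introduces (ws->info.ib_pad_dw_mask is the required IB alignment in dwords minus 1), the standalone sketch below shows both padding to the full alignment and padding that leaves exactly 4 dwords free for a chained INDIRECT_BUFFER packet, mirroring the "!= ib_pad_dw_mask - 3" condition in amdgpu_cs_check_space() above. The names fake_ib, emit_dw, pad_ib and pad_ib_leave_chain_room are hypothetical stand-ins invented for this sketch, not part of the winsys API; only the type-3 NOP value 0xffff1000 and the mask arithmetic are taken from the patch.

/* Illustrative sketch only; not mesa code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_PKT3_NOP 0xffff1000u /* type-3 NOP, as used for GFX/compute rings above */

struct fake_ib {
   uint32_t buf[256];
   unsigned cdw; /* current dword count */
};

static void emit_dw(struct fake_ib *ib, uint32_t dw)
{
   ib->buf[ib->cdw++] = dw;
}

/* Pad until cdw is a multiple of the IB alignment in dwords.
 * pad_dw_mask is (alignment_in_dwords - 1), e.g. 7 for 32-byte alignment. */
static void pad_ib(struct fake_ib *ib, uint32_t pad_dw_mask)
{
   while (ib->cdw & pad_dw_mask)
      emit_dw(ib, FAKE_PKT3_NOP);
}

/* Pad but leave exactly 4 dwords free for a chaining packet
 * (header + two address dwords + size dword). */
static void pad_ib_leave_chain_room(struct fake_ib *ib, uint32_t pad_dw_mask)
{
   while ((ib->cdw & pad_dw_mask) != pad_dw_mask - 3)
      emit_dw(ib, FAKE_PKT3_NOP);
}

int main(void)
{
   struct fake_ib ib = { .cdw = 9 };
   uint32_t mask = 7; /* assume the kernel wants 8-dword alignment */

   pad_ib_leave_chain_room(&ib, mask);
   /* The 4 dwords of the chain packet complete the aligned block. */
   emit_dw(&ib, 0); emit_dw(&ib, 0); emit_dw(&ib, 0); emit_dw(&ib, 0);
   assert((ib.cdw & mask) == 0);

   pad_ib(&ib, mask); /* already aligned, so this emits nothing */
   printf("cdw = %u (a multiple of %u dwords)\n", ib.cdw, (unsigned)(mask + 1));
   return 0;
}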