+ return cs->request.ip_type != AMDGPU_HW_IP_UVD &&
+ cs->request.ip_type != AMDGPU_HW_IP_VCE;
+}
+
+static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
+{
+ return cs->ctx->ws->info.chip_class >= CIK &&
+ cs->ring_type == RING_GFX;
+}
+
+static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
+{
+ if (ring_type == RING_GFX)
+ return 4; /* for chaining */
+
+ return 0;
+}
+
+int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
+{
+ unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
+ int i = cs->buffer_indices_hashlist[hash];
+ struct amdgpu_cs_buffer *buffers;
+ int num_buffers;
+
+ if (bo->bo) {
+ buffers = cs->real_buffers;
+ num_buffers = cs->num_real_buffers;
+ } else {
+ buffers = cs->slab_buffers;
+ num_buffers = cs->num_slab_buffers;
+ }
+
+ /* not found or found */
+ if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
+ return i;
+
+ /* Hash collision, look for the BO in the list of buffers linearly. */
+ for (i = num_buffers - 1; i >= 0; i--) {
+ if (buffers[i].bo == bo) {
+ /* Put this buffer in the hash list.
+ * This will prevent additional hash collisions if there are
+ * several consecutive lookup_buffer calls for the same buffer.
+ *
+ * Example: Assuming buffers A,B,C collide in the hash list,
+ * the following sequence of buffers:
+ * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
+ * will collide here: ^ and here: ^,
+ * meaning that we should get very few collisions in the end. */
+ cs->buffer_indices_hashlist[hash] = i;
+ return i;
+ }
+ }
+ return -1;
+}
+
/* Look up the real (kernel-visible) buffer "bo" in the buffer list of
 * acs->csc and return its index, adding it to the list first if needed.
 *
 * The real-buffer state is kept in three parallel arrays (real_buffers,
 * handles, flags) indexed by the returned value.
 *
 * Returns the list index on success, or -1 if growing the list failed.
 */
static int
amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_cs_buffer *buffer;
   unsigned hash;
   int idx = amdgpu_lookup_buffer(cs, bo);

   if (idx >= 0)
      return idx;

   /* New buffer, check if the backing array is large enough. */
   if (cs->num_real_buffers >= cs->max_real_buffers) {
      /* Grow by at least 16 entries and at least 30% to amortize copies. */
      unsigned new_max =
         MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3));
      struct amdgpu_cs_buffer *new_buffers;
      amdgpu_bo_handle *new_handles;
      uint8_t *new_flags;

      /* Allocate all three parallel arrays up front so that a failure
       * leaves the existing lists fully intact (all-or-nothing growth). */
      new_buffers = MALLOC(new_max * sizeof(*new_buffers));
      new_handles = MALLOC(new_max * sizeof(*new_handles));
      new_flags = MALLOC(new_max * sizeof(*new_flags));

      if (!new_buffers || !new_handles || !new_flags) {
         fprintf(stderr, "amdgpu_lookup_or_add_buffer: allocation failed\n");
         /* FREE of a NULL pointer is a no-op, so partially successful
          * allocations are safe to release here. */
         FREE(new_buffers);
         FREE(new_handles);
         FREE(new_flags);
         return -1;
      }

      memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers));
      memcpy(new_handles, cs->handles, cs->num_real_buffers * sizeof(*new_handles));
      memcpy(new_flags, cs->flags, cs->num_real_buffers * sizeof(*new_flags));

      FREE(cs->real_buffers);
      FREE(cs->handles);
      FREE(cs->flags);

      cs->max_real_buffers = new_max;
      cs->real_buffers = new_buffers;
      cs->handles = new_handles;
      cs->flags = new_flags;
   }

   /* Append the buffer to all three arrays and pin it for the CS lifetime. */
   idx = cs->num_real_buffers;
   buffer = &cs->real_buffers[idx];

   memset(buffer, 0, sizeof(*buffer));
   amdgpu_winsys_bo_reference(&buffer->bo, bo);
   cs->handles[idx] = bo->bo;
   cs->flags[idx] = 0;
   p_atomic_inc(&bo->num_cs_references);
   cs->num_real_buffers++;

   /* Remember the index in the shared hash table for fast repeat lookups. */
   hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
   cs->buffer_indices_hashlist[hash] = idx;

   /* Account the buffer's size against this submission's memory usage. */
   if (bo->initial_domain & RADEON_DOMAIN_VRAM)
      acs->main.base.used_vram += bo->base.size;
   else if (bo->initial_domain & RADEON_DOMAIN_GTT)
      acs->main.base.used_gart += bo->base.size;

   return idx;
}
+
/* Look up the suballocated buffer "bo" in the slab-buffer list of acs->csc
 * and return its index, adding it (and the real buffer backing its slab)
 * if needed.
 *
 * Returns the slab-list index on success, or -1 on allocation failure.
 */
static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs,
                                            struct amdgpu_winsys_bo *bo)
{
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_cs_buffer *buffer;
   unsigned hash;
   int idx = amdgpu_lookup_buffer(cs, bo);
   int real_idx;

   if (idx >= 0)
      return idx;

   /* The slab's backing real buffer must be in the real-buffer list; add it
    * first and remember its index for submission. */
   real_idx = amdgpu_lookup_or_add_real_buffer(acs, bo->u.slab.real);
   if (real_idx < 0)
      return -1;

   /* New buffer, check if the backing array is large enough. */
   if (cs->num_slab_buffers >= cs->max_slab_buffers) {
      /* Grow by at least 16 entries and at least 30% to amortize copies. */
      unsigned new_max =
         MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3));
      struct amdgpu_cs_buffer *new_buffers;

      new_buffers = REALLOC(cs->slab_buffers,
                            cs->max_slab_buffers * sizeof(*new_buffers),
                            new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n");
         return -1;
      }

      cs->max_slab_buffers = new_max;
      cs->slab_buffers = new_buffers;
   }

   /* Append the buffer and pin it for the CS lifetime. */
   idx = cs->num_slab_buffers;
   buffer = &cs->slab_buffers[idx];

   memset(buffer, 0, sizeof(*buffer));
   amdgpu_winsys_bo_reference(&buffer->bo, bo);
   buffer->u.slab.real_idx = real_idx;
   p_atomic_inc(&bo->num_cs_references);
   cs->num_slab_buffers++;

   /* Remember the index in the shared hash table for fast repeat lookups. */
   hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
   cs->buffer_indices_hashlist[hash] = idx;

   return idx;
}
+
/* Add "buf" to the buffer list of the command stream, merging usage and
 * priority with any previous additions of the same buffer.
 *
 * Returns the buffer's index in the real-buffer list (0 is also returned
 * on allocation failure and is indistinguishable from a valid index 0 —
 * NOTE(review): callers apparently tolerate this; confirm upstream).
 */
static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                     struct pb_buffer *buf,
                                     enum radeon_bo_usage usage,
                                     enum radeon_bo_domain domains,
                                     enum radeon_bo_priority priority)
{
   /* Don't use the "domains" parameter. Amdgpu doesn't support changing
    * the buffer placement during command submission.
    */
   struct amdgpu_cs *acs = amdgpu_cs(rcs);
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_cs_buffer *buffer;
   int index;

   /* Fast exit for no-op calls.
    * This is very effective with suballocators and linear uploaders that
    * are outside of the winsys.
    *
    * Valid only when the requested usage and priority are already subsets
    * of what was recorded for the last-added buffer. */
   if (bo == cs->last_added_bo &&
       (usage & cs->last_added_bo_usage) == usage &&
       (1ull << priority) & cs->last_added_bo_priority_usage)
      return cs->last_added_bo_index;

   if (!bo->bo) {
      /* Suballocated buffer: track usage on the slab entry, then fall
       * through to update the backing real buffer's entry as well. */
      index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
      if (index < 0)
         return 0;

      buffer = &cs->slab_buffers[index];
      buffer->usage |= usage;

      /* Synchronization is handled at slab granularity; the real buffer
       * only needs the remaining usage bits. */
      usage &= ~RADEON_USAGE_SYNCHRONIZED;
      index = buffer->u.slab.real_idx;
   } else {
      index = amdgpu_lookup_or_add_real_buffer(acs, bo);
      if (index < 0)
         return 0;
   }

   /* Merge usage/priority into the real buffer's entry. The per-index flag
    * keeps the highest priority seen, scaled down by 4 — presumably the
    * kernel's coarser priority range; verify against the submission path. */
   buffer = &cs->real_buffers[index];
   buffer->u.real.priority_usage |= 1llu << priority;
   buffer->usage |= usage;
   cs->flags[index] = MAX2(cs->flags[index], priority / 4);

   /* Refresh the fast-exit cache. */
   cs->last_added_bo = bo;
   cs->last_added_bo_index = index;
   cs->last_added_bo_usage = buffer->usage;
   cs->last_added_bo_priority_usage = buffer->u.real.priority_usage;
   return index;
}
+
/* Allocate and map a fresh backing buffer for "ib", replacing the old one.
 *
 * On success, ib->ib_mapped points at the CPU mapping, ib->used_ib_space is
 * reset to 0, and ib->big_ib_buffer holds the only reference kept here.
 * Returns false if buffer creation or mapping failed (the old buffer is
 * left in place in that case).
 */
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
{
   struct pb_buffer *pb;
   uint8_t *mapped;
   unsigned buffer_size;

   /* Always create a buffer that is at least as large as the maximum seen IB
    * size, aligned to a power of two (and multiplied by 4 to reduce internal
    * fragmentation if chaining is not available). Limit to 512k dwords, which
    * is the largest power of two that fits into the size field of the
    * INDIRECT_BUFFER packet.
    */
   if (amdgpu_cs_has_chaining(amdgpu_cs_from_ib(ib)))
      buffer_size = 4 *util_next_power_of_two(ib->max_ib_size);
   else
      buffer_size = 4 *util_next_power_of_two(4 * ib->max_ib_size);

   buffer_size = MIN2(buffer_size, 4 * 512 * 1024);

   /* Enforce a per-IB-type minimum size (values in bytes; 4 bytes/dword). */
   switch (ib->ib_type) {
   case IB_CONST_PREAMBLE:
      buffer_size = MAX2(buffer_size, 4 * 1024);
      break;
   case IB_CONST:
      buffer_size = MAX2(buffer_size, 16 * 1024 * 4);
      break;
   case IB_MAIN:
      buffer_size = MAX2(buffer_size, 8 * 1024 * 4);
      break;
   default:
      unreachable("unhandled IB type");
   }

   /* IBs are read by the GPU from GTT and written by the CPU, hence
    * GTT placement with CPU access. */
   pb = ws->base.buffer_create(&ws->base, buffer_size,
                               ws->info.gart_page_size,
                               RADEON_DOMAIN_GTT,
                               RADEON_FLAG_CPU_ACCESS);
   if (!pb)
      return false;

   mapped = ws->base.buffer_map(pb, NULL, PIPE_TRANSFER_WRITE);
   if (!mapped) {
      pb_reference(&pb, NULL);
      return false;
   }

   /* Transfer ownership: big_ib_buffer takes a reference (dropping its old
    * buffer, if any), then the local reference is released. */
   pb_reference(&ib->big_ib_buffer, pb);
   pb_reference(&pb, NULL);

   ib->ib_mapped = mapped;
   ib->used_ib_space = 0;

   return true;
}
+
+static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
+{
+ switch (ib_type) {
+ case IB_MAIN:
+ /* Smaller submits means the GPU gets busy sooner and there is less
+ * waiting for buffers and fences. Proof:
+ * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
+ */
+ return 20 * 1024;
+ case IB_CONST_PREAMBLE:
+ case IB_CONST:
+ /* There isn't really any reason to limit CE IB size beyond the natural
+ * limit implied by the main IB, except perhaps GTT size. Just return
+ * an extremely large value that we never get anywhere close to.
+ */
+ return 16 * 1024 * 1024;
+ default:
+ unreachable("bad ib_type");
+ }
+}
+
+static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
+ enum ib_type ib_type)
+{
+ struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws;