bool has_llc:1;
bool has_mmap_wc:1;
bool bo_reuse:1;
+ bool supports_48b_addresses:1;
};
static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
bo->reusable = true;
bo->cache_coherent = bufmgr->has_llc;
bo->index = -1;
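+ /* New BOs default to the full 48-bit address space when the kernel
+ * supports it; emit_reloc() clears this flag again for buffers that
+ * must stay below 4GB (see RELOC_32BIT).
+ */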
+ if (bufmgr->supports_48b_addresses)
+ bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
mtx_unlock(&bufmgr->lock);
return v;
}
+static bool
+gem_supports_48b_addresses(int fd)
+{
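+ /* Probe by submitting an execbuf that asks for a 48-bit-capable object
+ * but names a bogus context ID (rsvd1). Kernels that understand
+ * EXEC_OBJECT_SUPPORTS_48B_ADDRESS validate the object flags first and
+ * then fail the context lookup with ENOENT; older kernels reject the
+ * unknown flag with EINVAL, so ENOENT means the flag is supported.
+ */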
+ struct drm_i915_gem_exec_object2 obj = {
+ .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+ };
+
+ struct drm_i915_gem_execbuffer2 execbuf = {
+ .buffers_ptr = (uintptr_t)&obj,
+ .buffer_count = 1,
+ .rsvd1 = 0xffffffu,
+ };
+
+ int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
+
+ return ret == -1 && errno == ENOENT;
+}
+
/**
* Initializes the GEM buffer manager, which uses the kernel to allocate, map,
* and manage buffer objects.
bufmgr->has_llc = devinfo->has_llc;
bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
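+ /* 48-bit virtual addressing requires Gen8+ hardware and a kernel that
+ * understands EXEC_OBJECT_SUPPORTS_48B_ADDRESS, so check both.
+ */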
+ bufmgr->supports_48b_addresses =
+ devinfo->gen >= 8 && gem_supports_48b_addresses(fd);
init_cache_buckets(bufmgr);
}
if (devinfo->gen >= 8) {
+ /* STATE_BASE_ADDRESS has issues with 48-bit address spaces. If the
+ * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits,
+ * the GPU appears to treat all accesses to the buffer as being out
+ * of bounds and returns zero. To work around this, we pin all SBAs
+ * to the bottom 4GB.
+ */
uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
int pkt_len = devinfo->gen >= 9 ? 19 : 16;
OUT_BATCH(0);
OUT_BATCH(mocs_wb << 16);
/* Surface state base address: */
- OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
+ OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
/* Dynamic state base address: */
- OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
+ OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
/* Indirect object base address: MEDIA_OBJECT data */
OUT_BATCH(mocs_wb << 4 | 1);
OUT_BATCH(0);
/* Instruction base address: shader kernels (incl. SIP) */
- OUT_RELOC64(brw->cache.bo, 0, mocs_wb << 4 | 1);
+ OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs_wb << 4 | 1);

/* General state buffer size */
OUT_BATCH(0xfffff001);
/* Dynamic state buffer size */
* FIXME: move to the point of assignment.
*/
assert((aux_offset & 0xfff) == 0);
- uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
- *aux_addr = brw_state_reloc(&brw->batch,
- *surf_offset +
- brw->isl_dev.ss.aux_addr_offset,
- aux_bo, *aux_addr,
- reloc_flags);
+
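+ /* On Gen8+ the auxiliary surface address in SURFACE_STATE is a 64-bit
+ * field, so the relocated value must be written as a uint64_t; earlier
+ * gens pack it into a single 32-bit dword.
+ */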
+ if (devinfo->gen >= 8) {
+ uint64_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
+ *aux_addr = brw_state_reloc(&brw->batch,
+ *surf_offset +
+ brw->isl_dev.ss.aux_addr_offset,
+ aux_bo, *aux_addr,
+ reloc_flags);
+ } else {
+ uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
+ *aux_addr = brw_state_reloc(&brw->batch,
+ *surf_offset +
+ brw->isl_dev.ss.aux_addr_offset,
+ aux_bo, *aux_addr,
+ reloc_flags);
+ }
}
}
.buffer = brw->batch.state.bo,
.offset = offset,
+ /* The VF cache designers apparently cut corners, and made the cache
+ * only consider the bottom 32 bits of memory addresses. If you happen
+ * to have two vertex buffers which get placed exactly 4 GiB apart and
+ * use them in back-to-back draw calls, you can get collisions. To work
+ * around this problem, we restrict vertex buffers to the low 32 bits of
+ * the address space.
+ */
+ .reloc_flags = RELOC_32BIT,
+
#if GEN_GEN == 10
.mocs = CNL_MOCS_WB,
#elif GEN_GEN == 9
}
}
-static struct brw_address
+UNUSED static struct brw_address
rw_bo(struct brw_bo *bo, uint32_t offset)
{
return (struct brw_address) {
};
}
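+
+/* 32-bit variants of rw_bo() and ro_bo(): passing RELOC_32BIT makes
+ * emit_reloc() clear EXEC_OBJECT_SUPPORTS_48B_ADDRESS, so the kernel
+ * keeps these buffers below 4GB.
+ */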
+static struct brw_address
+rw_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+ return (struct brw_address) {
+ .bo = bo,
+ .offset = offset,
+ .reloc_flags = RELOC_WRITE | RELOC_32BIT,
+ };
+}
+
+static struct brw_address
+ro_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+ return (struct brw_address) {
+ .bo = bo,
+ .offset = offset,
+ .reloc_flags = RELOC_32BIT,
+ };
+}
+
UNUSED static struct brw_address
ggtt_bo(struct brw_bo *bo, uint32_t offset)
{
struct GENX(VERTEX_BUFFER_STATE) buf_state = {
.VertexBufferIndex = buffer_nr,
.BufferPitch = stride,
- .BufferStartingAddress = ro_bo(bo, start_offset),
+
+ /* The VF cache designers apparently cut corners, and made the cache
+ * only consider the bottom 32 bits of memory addresses. If you happen
+ * to have two vertex buffers which get placed exactly 4 GiB apart and
+ * use them in back-to-back draw calls, you can get collisions. To work
+ * around this problem, we restrict vertex buffers to the low 32 bits of
+ * the address space.
+ */
+ .BufferStartingAddress = ro_32_bo(bo, start_offset),
#if GEN_GEN >= 8
.BufferSize = end_offset - start_offset,
#endif
ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
#endif
ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
- ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
+
+ /* The VF cache designers apparently cut corners, and made the cache
+ * only consider the bottom 32 bits of memory addresses. If you happen
+ * to have two index buffers which get placed exactly 4 GiB apart and
+ * use them in back-to-back draw calls, you can get collisions. To work
+ * around this problem, we restrict index buffers to the low 32 bits of
+ * the address space.
+ */
+ ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
#if GEN_GEN >= 8
ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
ib.BufferSize = brw->ib.size;
#endif
if (wm_prog_data->base.total_scratch) {
- wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+ wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
wm.PerThreadScratchSpace =
ffs(stage_state->per_thread_scratch) - 11;
}
/* ---------------------------------------------------------------------- */
+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken. If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \
pkt.SamplerCount = \
pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \
\
if (stage_prog_data->total_scratch) { \
- pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0); \
+ pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
pkt.PerThreadScratchSpace = \
ffs(stage_state->per_thread_scratch) - 11; \
} \
if (prog_data->base.total_scratch) {
ps.ScratchSpaceBasePointer =
- rw_bo(stage_state->scratch_bo,
- ffs(stage_state->per_thread_scratch) - 11);
+ rw_32_bo(stage_state->scratch_bo,
+ ffs(stage_state->per_thread_scratch) - 11);
}
}
}
*/
per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
}
- vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+ vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
vfe.PerThreadScratchSpace = per_thread_scratch_value;
}
unsigned int index = add_exec_bo(batch, target);
struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
+ if (reloc_flags & RELOC_32BIT) {
+ /* Restrict this buffer to the low 32 bits of the address space.
+ *
+ * Altering the validation list flags restricts it for this batch,
+ * but we also alter the BO's kflags to restrict it permanently
+ * (until the BO is destroyed and put back in the cache). Buffers
+ * may stay bound across batches, and we want to keep it constrained.
+ */
+ target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+ entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+ /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
+ reloc_flags &= ~RELOC_32BIT;
+ }
+
if (reloc_flags)
entry->flags |= reloc_flags & batch->valid_reloc_flags;
#define RELOC_WRITE EXEC_OBJECT_WRITE
#define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
+/* Inverted meaning, but reusing the same bit: emit_reloc() flips it by
+ * clearing EXEC_OBJECT_SUPPORTS_48B_ADDRESS on the target BO.
+ */
+#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
uint64_t brw_batch_reloc(struct intel_batchbuffer *batch,
uint32_t batch_offset,
struct brw_bo *target,