Since almost all BOs (buffer objects) will be in one CL (command list) at a
time, this cache will almost always hit, except for the first use of the BO
in each CL.
This didn't show up as statistically significant on the minetest trace
(n=340), but if I lop off the throttled lobe of the bimodal distribution,
the improvement is very clearly significant (0.74731% +/- 0.162093%,
n=269).
        uint32_t handle;
        uint32_t size;
+       /* This will be read/written by multiple threads without a lock -- you
+        * should take a snapshot and use it to see if you happen to be in the
+        * CL's handles at this position, to make most lookups O(1). It's
+        * volatile to make sure that the compiler doesn't emit multiple loads
+        * from the address, which would make the lookup racy.
+        */
+       volatile uint32_t last_hindex;
+
        /** Entry in the linked list of buffers freed, by age. */
        struct list_head time_list;
        /** Entry in the per-page-count linked list of buffers freed (by age). */
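
The comment on last_hindex describes the lockless snapshot idiom briefly;
the following standalone sketch spells it out (shared_index, hit_racy, and
hit_snapshot are hypothetical names, not driver code). Reading the shared
index twice lets another thread change it between the bounds check and the
array access; copying it into a local first guarantees both use the same
value.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static volatile uint32_t shared_index; /* stands in for bo->last_hindex */

/* RACY: two reads of the shared index.  Another thread can change it
 * between the bounds check and the array access, so the check no longer
 * protects the access. */
static bool
hit_racy(const uint32_t *handles, uint32_t count, uint32_t handle)
{
        return shared_index < count && handles[shared_index] == handle;
}

/* SAFE: one read into a local snapshot; the bounds check and the array
 * access are guaranteed to see the same value. */
static bool
hit_snapshot(const uint32_t *handles, uint32_t count, uint32_t handle)
{
        uint32_t snap = shared_index;
        return snap < count && handles[snap] == handle;
}

int
main(void)
{
        uint32_t handles[] = { 7, 42, 9 };

        shared_index = 1;
        printf("racy: %d, snapshot: %d\n",
               hit_racy(handles, 3, 42), hit_snapshot(handles, 3, 42));
        return 0;
}

A C11 atomic_load_explicit() with memory_order_relaxed would express the
same single-load intent more explicitly; volatile achieves it here without
pulling in stdatomic.
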
vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo)
{
        uint32_t hindex;
        uint32_t *current_handles = job->bo_handles.base;
+       uint32_t cl_hindex_count = cl_offset(&job->bo_handles) / 4;
+       uint32_t last_hindex = bo->last_hindex; /* volatile read! */

-       for (hindex = 0; hindex < cl_offset(&job->bo_handles) / 4; hindex++) {
-               if (current_handles[hindex] == bo->handle)
+       /* Fast path: the cached index still points at this BO's handle. */
+       if (last_hindex < cl_hindex_count &&
+           current_handles[last_hindex] == bo->handle) {
+               return last_hindex;
+       }
+
+       /* Slow path: linear scan, caching the hit for the next lookup. */
+       for (hindex = 0; hindex < cl_hindex_count; hindex++) {
+               if (current_handles[hindex] == bo->handle) {
+                       bo->last_hindex = hindex;
                        return hindex;
+               }
        }

        struct vc4_cl_out *out;

...

        job->bo_space += bo->size;
+       /* The handle was just appended at index hindex; cache that slot. */
+       bo->last_hindex = hindex;
        return hindex;
}
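
Putting it together, here is a self-contained sketch of the same
cached-lookup pattern with simplified stand-ins for the driver types
(fake_bo, fake_job, and hindex_lookup are hypothetical names, not the Mesa
API, and the CL bookkeeping is reduced to a plain array): the first lookup
appends the handle and caches its slot, the second returns in O(1) without
scanning.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fake_bo {
        uint32_t handle;
        volatile uint32_t last_hindex; /* cached position in the CL */
};

struct fake_job {
        uint32_t handles[64]; /* the CL's GEM handle table */
        uint32_t count;       /* handles currently in the table */
};

static uint32_t
hindex_lookup(struct fake_job *job, struct fake_bo *bo)
{
        uint32_t last = bo->last_hindex; /* snapshot the volatile */

        /* Fast path: the cached index still points at this handle. */
        if (last < job->count && job->handles[last] == bo->handle)
                return last;

        /* Slow path: linear scan, caching the result for next time. */
        for (uint32_t i = 0; i < job->count; i++) {
                if (job->handles[i] == bo->handle) {
                        bo->last_hindex = i;
                        return i;
                }
        }

        /* Not present: append the handle and cache the new slot. */
        assert(job->count < 64);
        job->handles[job->count] = bo->handle;
        bo->last_hindex = job->count;
        return job->count++;
}

int
main(void)
{
        struct fake_job job = { .count = 0 };
        struct fake_bo bo = { .handle = 42, .last_hindex = 0 };

        uint32_t first = hindex_lookup(&job, &bo);  /* appends: slot 0 */
        uint32_t second = hindex_lookup(&job, &bo); /* cache hit */

        assert(first == second && bo.last_hindex == first);
        printf("handle %u at hindex %u\n", bo.handle, first);
        return 0;
}

Note that the cache is only a hint: a stale last_hindex can at worst miss
and fall back to the scan, never return a wrong index, because the snapshot
is re-validated against the current handle table before being trusted.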