#include "brw_oa_sklgt3.h"
#include "brw_oa_sklgt4.h"
#include "brw_oa_bxt.h"
+#include "brw_oa_kblgt2.h"
+#include "brw_oa_kblgt3.h"
+#include "brw_oa_glk.h"
#include "intel_batchbuffer.h"
#define FILE_DEBUG_FLAG DEBUG_PERFMON
int refcount;
int len;
uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
+ uint32_t last_timestamp;
};
/**
*/
struct brw_bo *bo;
+ /**
+ * Address of the mapped @bo
+ */
+ void *map;
+
/**
* The MI_REPORT_PERF_COUNT command lets us specify a unique
* ID that will be reflected in the resulting OA report
}
}
-static bool
-read_oa_samples(struct brw_context *brw)
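+/* Status of an attempt to read OA samples from the i915 perf stream:
+ * ERROR if the stream could not be read, UNFINISHED if the end
+ * timestamp hasn't been reached yet, FINISHED otherwise.
+ */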
+enum OaReadStatus {
+ OA_READ_STATUS_ERROR,
+ OA_READ_STATUS_UNFINISHED,
+ OA_READ_STATUS_FINISHED,
+};
+
+static enum OaReadStatus
+read_oa_samples_until(struct brw_context *brw,
+ uint32_t start_timestamp,
+ uint32_t end_timestamp)
{
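+ /* Resume from the timestamp of the last report in the most
+ * recently filled sample buffer.
+ */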
+ struct exec_node *tail_node =
+ exec_list_get_tail(&brw->perfquery.sample_buffers);
+ struct brw_oa_sample_buf *tail_buf =
+ exec_node_data(struct brw_oa_sample_buf, tail_node, link);
+ uint32_t last_timestamp = tail_buf->last_timestamp;
+
while (1) {
struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
+ uint32_t offset;
int len;
while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
if (len < 0) {
if (errno == EAGAIN)
- return true;
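+ /* Relying on unsigned modular arithmetic here: the deltas
+ * stay correct even if the 32-bit OA timestamp wrapped
+ * between the start and end reports.
+ */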
+ return ((last_timestamp - start_timestamp) >=
+ (end_timestamp - start_timestamp)) ?
+ OA_READ_STATUS_FINISHED :
+ OA_READ_STATUS_UNFINISHED;
else {
DBG("Error reading i915 perf samples: %m\n");
- return false;
}
- } else {
+ } else
DBG("Spurious EOF reading i915 perf samples\n");
- return false;
- }
+
+ return OA_READ_STATUS_ERROR;
}
buf->len = len;
exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);
+
+ /* Go through the reports and update the last timestamp. */
+ offset = 0;
+ while (offset < buf->len) {
+ const struct drm_i915_perf_record_header *header =
+ (const struct drm_i915_perf_record_header *) &buf->buf[offset];
+ uint32_t *report = (uint32_t *) (header + 1);
+
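+ /* The report's 32-bit timestamp lives in its second dword. */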
+ if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
+ last_timestamp = report[1];
+
+ offset += header->size;
+ }
+
+ buf->last_timestamp = last_timestamp;
}
unreachable("not reached");
+ return OA_READ_STATUS_ERROR;
+}
+
+/**
+ * Try to read all the reports until either the delimiting timestamp
+ * is reached or an error arises.
+ */
+static bool
+read_oa_samples_for_query(struct brw_context *brw,
+ struct brw_perf_query_object *obj)
+{
+ uint32_t *start;
+ uint32_t *last;
+ uint32_t *end;
+
+ /* We need the MI_REPORT_PERF_COUNT to land before we can start
+ * accumulating. */
+ assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
+ !brw_bo_busy(obj->oa.bo));
+
+ /* Map the BO once here and let accumulate_oa_reports() unmap
+ * it. */
+ if (obj->oa.map == NULL)
+ obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);
+
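+ /* MI_RPC_BO_END_OFFSET_BYTES is a byte offset; the arithmetic on
+ * the void * map relies on GCC's byte-sized void * extension.
+ */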
+ start = last = obj->oa.map;
+ end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
+
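+ /* Sanity check the report IDs written by the MI_RPC commands at
+ * the start and end of the query.
+ */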
+ if (start[0] != obj->oa.begin_report_id) {
+ DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+ return true;
+ }
+ if (end[0] != (obj->oa.begin_report_id + 1)) {
+ DBG("Spurious end report id=%"PRIu32"\n", end[0]);
+ return true;
+ }
+
+ /* Read the reports until the end timestamp. */
+ switch (read_oa_samples_until(brw, start[1], end[1])) {
+ case OA_READ_STATUS_ERROR:
+ /* Fall through and let accumulate_oa_reports() deal with the
+ * error. */
+ case OA_READ_STATUS_FINISHED:
+ return true;
+ case OA_READ_STATUS_UNFINISHED:
+ return false;
+ }
+
+ unreachable("invalid read status");
return false;
}
/**
- * Accumulate raw OA counter values based on deltas between pairs
- * of OA reports.
+ * Accumulate raw OA counter values based on deltas between pairs of
+ * OA reports.
*
* Accumulation starts from the first report captured via
* MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
accumulate_oa_reports(struct brw_context *brw,
struct brw_perf_query_object *obj)
{
+ const struct gen_device_info *devinfo = &brw->screen->devinfo;
struct gl_perf_query_object *o = &obj->base;
- uint32_t *query_buffer;
uint32_t *start;
uint32_t *last;
uint32_t *end;
struct exec_node *first_samples_node;
bool in_ctx = true;
uint32_t ctx_id;
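+ /* Number of reports observed since last switching away from our
+ * context.
+ */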
+ int out_duration = 0;
assert(o->Ready);
+ assert(obj->oa.map != NULL);
- /* Collect the latest periodic OA reports from i915 perf */
- if (!read_oa_samples(brw))
- goto error;
-
- query_buffer = brw_bo_map(brw, obj->oa.bo, MAP_READ);
-
- start = last = query_buffer;
- end = query_buffer + (MI_RPC_BO_END_OFFSET_BYTES / sizeof(uint32_t));
+ start = last = obj->oa.map;
+ end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
if (start[0] != obj->oa.begin_report_id) {
DBG("Spurious start report id=%"PRIu32"\n", start[0]);
* For Haswell we can rely on the HW to stop the progress
* of OA counters while any other context is active.
*/
- if (brw->gen >= 8) {
+ if (devinfo->gen >= 8) {
if (in_ctx && report[2] != ctx_id) {
DBG("i915 perf: Switch AWAY (observed by ID change)\n");
in_ctx = false;
+ out_duration = 0;
} else if (in_ctx == false && report[2] == ctx_id) {
DBG("i915 perf: Switch TO\n");
in_ctx = true;
- add = false;
+
+ /* From experimentation in IGT, we found that the OA unit
+ * might label some report as "idle" (using an invalid
+ * context ID), right after a report for a given context.
+ * Deltas generated by those reports actually belong to the
+ * previous context, even though they're not labelled as
+ * such.
+ *
+ * We didn't *really* Switch AWAY if we e.g. only saw a
+ * single periodic report while idle...
+ */
+ if (out_duration >= 1)
+ add = false;
} else if (in_ctx) {
assert(report[2] == ctx_id);
DBG("i915 perf: Continuation IN\n");
assert(report[2] != ctx_id);
DBG("i915 perf: Continuation OUT\n");
add = false;
+ out_duration++;
}
}
DBG("Marking %d accumulated - results gathered\n", o->Id);
brw_bo_unmap(obj->oa.bo);
+ obj->oa.map = NULL;
obj->oa.results_accumulated = true;
drop_from_unaccumulated_query_list(brw, obj);
dec_n_oa_users(brw);
error:
brw_bo_unmap(obj->oa.bo);
+ obj->oa.map = NULL;
discard_all_queries(brw);
}
if (brw_batch_references(&brw->batch, bo))
intel_batchbuffer_flush(brw);
- brw_bo_wait_rendering(brw, bo);
+ brw_bo_wait_rendering(bo);
+
+ /* Due to a race condition between the OA unit signaling report
+ * availability and the report actually being written into memory,
+ * we need to wait for all the reports to come in before we can
+ * read them.
+ */
+ if (obj->query->kind == OA_COUNTERS) {
+ while (!read_oa_samples_for_query(brw, obj))
+ ;
+ }
}
static bool
return (obj->oa.results_accumulated ||
(obj->oa.bo &&
!brw_batch_references(&brw->batch, obj->oa.bo) &&
- !brw_bo_busy(obj->oa.bo)));
-
+ !brw_bo_busy(obj->oa.bo) &&
+ read_oa_samples_for_query(brw, obj)));
case PIPELINE_STATS:
return (obj->pipeline_stats.bo &&
!brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
+ brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
+ /* Assuming uniform distribution of subslices per slice. */
+ brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
if (devinfo->is_haswell) {
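+ /* Haswell can't query the slice/subslice masks from the kernel,
+ * so derive them from devinfo instead.
+ */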
+ brw->perfquery.sys_vars.slice_mask = 0;
+ brw->perfquery.sys_vars.subslice_mask = 0;
+
+ for (int s = 0; s < devinfo->num_slices; s++)
+ brw->perfquery.sys_vars.slice_mask |= 1U << s;
+ for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
+ brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
+
if (devinfo->gt == 1) {
brw->perfquery.sys_vars.n_eus = 10;
- brw->perfquery.sys_vars.n_eu_slices = 1;
- brw->perfquery.sys_vars.n_eu_sub_slices = 1;
- brw->perfquery.sys_vars.slice_mask = 0x1;
- brw->perfquery.sys_vars.subslice_mask = 0x1;
} else if (devinfo->gt == 2) {
brw->perfquery.sys_vars.n_eus = 20;
- brw->perfquery.sys_vars.n_eu_slices = 1;
- brw->perfquery.sys_vars.n_eu_sub_slices = 2;
- brw->perfquery.sys_vars.slice_mask = 0x1;
- brw->perfquery.sys_vars.subslice_mask = 0x3;
} else if (devinfo->gt == 3) {
brw->perfquery.sys_vars.n_eus = 40;
- brw->perfquery.sys_vars.n_eu_slices = 2;
- brw->perfquery.sys_vars.n_eu_sub_slices = 2;
- brw->perfquery.sys_vars.slice_mask = 0x3;
- brw->perfquery.sys_vars.subslice_mask = 0xf;
} else
unreachable("not reached");
} else {
__DRIscreen *screen = brw->screen->driScrnPriv;
drm_i915_getparam_t gp;
int ret;
- int n_eus = 0;
int slice_mask = 0;
int ss_mask = 0;
- int s_max = devinfo->num_slices; /* maximum number of slices */
- int ss_max = 0; /* maximum number of subslices per slice */
+ /* maximum number of slices */
+ int s_max = devinfo->num_slices;
+ /* maximum number of subslices per slice (assuming uniform subslices per
+ * slice)
+ */
+ int ss_max = devinfo->num_subslices[0];
uint64_t subslice_mask = 0;
int s;
- if (devinfo->gen == 8) {
- if (devinfo->gt == 1) {
- ss_max = 2;
- } else {
- ss_max = 3;
- }
- } else if (devinfo->gen == 9) {
- /* XXX: beware that the kernel (as of writing) actually works as if
- * ss_max == 4 since the HW register that reports the global subslice
- * mask has 4 bits while in practice the limit is 3. It's also
- * important that we initialize $SubsliceMask with 3 bits per slice
- * since that's what the counter availability expressions in XML
- * expect.
- */
- ss_max = 3;
- } else
- return false;
-
- gp.param = I915_PARAM_EU_TOTAL;
- gp.value = &n_eus;
- ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
- if (ret)
- return false;
-
gp.param = I915_PARAM_SLICE_MASK;
gp.value = &slice_mask;
ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
if (ret)
return false;
- brw->perfquery.sys_vars.n_eus = n_eus;
+ brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
brw->perfquery.sys_vars.slice_mask = slice_mask;
if (devinfo->gt == 4)
return brw_oa_register_queries_sklgt4;
}
+ if (devinfo->is_kabylake) {
+ if (devinfo->gt == 2)
+ return brw_oa_register_queries_kblgt2;
+ if (devinfo->gt == 3)
+ return brw_oa_register_queries_kblgt3;
+ }
+ if (devinfo->is_geminilake)
+ return brw_oa_register_queries_glk;
return NULL;
}