+
+ functions->MemoryBarrier = brw_memory_barrier;
+}
+
+struct shader_times { /* Running totals kept per shader-time entry, accumulated across collections. */
+ uint64_t time; /* Sum of the first slot of the entry's BO record; the report scales it and prints it as cycles. */
+ uint64_t written; /* Sum of the second slot; the report divides time by this before rescaling. */
+ uint64_t reset; /* Sum of the third slot; the report multiplies by (written + reset). */
+};
+
+/**
+ * Set up the shader-time bookkeeping on a context: one buffer object for the
+ * raw records plus zero-initialized per-entry arrays, all sized for the same
+ * fixed number of entries.
+ */
+void
+brw_init_shader_time(struct brw_context *brw)
+{
+   /* Capacity shared by the BO and by every per-entry array below. */
+   const int max_entries = 2048;
+
+   brw->shader_time.max_entries = max_entries;
+
+   /* Each entry takes three SHADER_TIME_STRIDE-sized slots in the BO. */
+   brw->shader_time.bo = drm_intel_bo_alloc(brw->bufmgr, "shader time",
+                                            max_entries * SHADER_TIME_STRIDE * 3,
+                                            4096);
+
+   /* Per-entry metadata and running totals, parented to the context. */
+   brw->shader_time.names =
+      rzalloc_array(brw, const char *, max_entries);
+   brw->shader_time.ids =
+      rzalloc_array(brw, int, max_entries);
+   brw->shader_time.types =
+      rzalloc_array(brw, enum shader_time_shader_type, max_entries);
+   brw->shader_time.cumulative =
+      rzalloc_array(brw, struct shader_times, max_entries);
+}
+
+/**
+ * qsort() comparator for an array of pointers to uint64_t values.
+ *
+ * Orders the elements by the value each pointer refers to, ascending.
+ * Returns -1, 0 or 1.
+ */
+static int
+compare_time(const void *a, const void *b)
+{
+   /* Each element is itself a pointer, so two dereferences reach the value. */
+   const uint64_t lhs = **(uint64_t * const *)a;
+   const uint64_t rhs = **(uint64_t * const *)b;
+
+   /* Compare explicitly: a 64-bit difference would not survive being
+    * narrowed to the int that qsort() expects.
+    */
+   if (lhs > rhs)
+      return 1;
+   if (lhs < rhs)
+      return -1;
+   return 0;
+}
+
+/**
+ * Print one row of the shader-time report to stderr.
+ *
+ * \param stage       label printed in the first column
+ * \param name        label printed in the second column
+ * \param shader_num  numeric ID to print; 0 leaves the ID column empty
+ * \param time        cycle count for this row
+ * \param total       cycle count that \p time is shown as a percentage of
+ */
+static void
+print_shader_time_line(const char *stage, const char *name,
+                       int shader_num, uint64_t time, uint64_t total)
+{
+   const double cycles = (double)time;
+   const double gcycles = cycles / 1000000000.0;
+   const double percent = cycles / total * 100.0;
+
+   fprintf(stderr, "%-6s%-18s", stage, name);
+
+   /* Rows without an ID (e.g. per-stage totals) get a blank ID column. */
+   if (shader_num == 0)
+      fprintf(stderr, " : ");
+   else
+      fprintf(stderr, "%4d: ", shader_num);
+
+   fprintf(stderr, "%16lld (%7.2f Gcycles) %4.1f%%\n",
+           (long long)time, gcycles, percent);
+}
+
+/**
+ * Print the accumulated shader-time table to stderr.
+ *
+ * Each entry's cumulative time is scaled, the entries are sorted by scaled
+ * time in ascending order and printed one per line (entries with a scaled
+ * time of zero are skipped), followed by one total line per shader stage.
+ * Does nothing when there is no BO or no entries, and prints a short notice
+ * instead of a table when the overall total is zero.
+ */
+static void
+brw_report_shader_time(struct brw_context *brw)
+{
+   if (!brw->shader_time.bo)
+      return;
+   if (!brw->shader_time.num_entries)
+      return;
+
+   uint64_t scaled[brw->shader_time.num_entries];
+   uint64_t *sorted[brw->shader_time.num_entries];
+   uint64_t total_by_type[ST_CS + 1] = {0};
+   double total = 0;
+
+   for (int idx = 0; idx < brw->shader_time.num_entries; idx++) {
+      const enum shader_time_shader_type type = brw->shader_time.types[idx];
+      const uint64_t time = brw->shader_time.cumulative[idx].time;
+      uint64_t written, reset;
+      int is_stage;
+
+      switch (type) {
+      case ST_VS:
+      case ST_GS:
+      case ST_FS8:
+      case ST_FS16:
+      case ST_CS:
+         is_stage = 1;
+         break;
+      default:
+         is_stage = 0;
+         break;
+      }
+
+      if (is_stage) {
+         written = brw->shader_time.cumulative[idx].written;
+         reset = brw->shader_time.cumulative[idx].reset;
+      } else {
+         /* Entries of any other type are reported as their plain sum:
+          * written = 1 and reset = 0 make the scale factor one.
+          */
+         written = 1;
+         reset = 0;
+      }
+
+      /* Scale the time by (written + reset) / written, dividing first.
+       * With nothing written there is nothing to scale by, so keep the
+       * raw time.
+       */
+      if (written != 0)
+         scaled[idx] = time / written * (written + reset);
+      else
+         scaled[idx] = time;
+
+      if (is_stage)
+         total_by_type[type] += scaled[idx];
+
+      total += scaled[idx];
+      sorted[idx] = &scaled[idx];
+   }
+
+   if (total == 0) {
+      fprintf(stderr, "No shader time collected yet\n");
+      return;
+   }
+
+   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
+
+   fprintf(stderr, "\n");
+   fprintf(stderr, "type ID cycles spent %% of total\n");
+   for (int rank = 0; rank < brw->shader_time.num_entries; rank++) {
+      /* sorted[] holds pointers into scaled[], so the pointer offset is the
+       * index of the entry this rank refers to.
+       */
+      const int idx = sorted[rank] - scaled;
+
+      if (scaled[idx] == 0)
+         continue;
+
+      const char *stage;
+      switch (brw->shader_time.types[idx]) {
+      case ST_VS:
+         stage = "vs";
+         break;
+      case ST_GS:
+         stage = "gs";
+         break;
+      case ST_FS8:
+         stage = "fs8";
+         break;
+      case ST_FS16:
+         stage = "fs16";
+         break;
+      case ST_CS:
+         stage = "cs";
+         break;
+      default:
+         stage = "other";
+         break;
+      }
+
+      print_shader_time_line(stage,
+                             brw->shader_time.names[idx],
+                             brw->shader_time.ids[idx],
+                             scaled[idx], total);
+   }
+
+   /* Per-stage summary rows; an ID of 0 leaves the ID column blank. */
+   fprintf(stderr, "\n");
+   print_shader_time_line("total", "vs", 0, total_by_type[ST_VS], total);
+   print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
+   print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
+   print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
+   print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
+}
+
+/**
+ * Fold the records currently in the shader-time BO into the per-entry
+ * cumulative totals, then zero the BO so the next collection starts fresh.
+ *
+ * Does nothing when there is no BO. If the BO cannot be mapped, this
+ * collection is skipped and the cumulative totals are left untouched.
+ */
+static void
+brw_collect_shader_time(struct brw_context *brw)
+{
+   if (!brw->shader_time.bo)
+      return;
+
+   /* This probably stalls on the last rendering.  We could fix that by
+    * delaying reading the reports, but it doesn't look like it's a big
+    * overhead compared to the cost of tracking the time in the first place.
+    *
+    * A failed map must not be ignored: reading and clearing through the
+    * BO's virtual pointer would touch memory that was never mapped.
+    */
+   if (drm_intel_bo_map(brw->shader_time.bo, true) != 0)
+      return;
+
+   void *bo_map = brw->shader_time.bo->virtual;
+
+   for (int i = 0; i < brw->shader_time.num_entries; i++) {
+      /* Each entry owns three consecutive SHADER_TIME_STRIDE-byte slots.
+       * Offset through char * so the byte arithmetic does not rely on
+       * pointer math on void *.
+       */
+      uint32_t *times =
+         (uint32_t *)((char *)bo_map + i * 3 * SHADER_TIME_STRIDE);
+
+      /* times[] is indexed in 32-bit words, hence the byte offset / 4. */
+      brw->shader_time.cumulative[i].time += times[SHADER_TIME_STRIDE * 0 / 4];
+      brw->shader_time.cumulative[i].written += times[SHADER_TIME_STRIDE * 1 / 4];
+      brw->shader_time.cumulative[i].reset += times[SHADER_TIME_STRIDE * 2 / 4];
+   }
+
+   /* Zero the BO out to clear it for our next collection. */
+   memset(bo_map, 0, brw->shader_time.bo->size);
+   drm_intel_bo_unmap(brw->shader_time.bo);
+}
+
+void
+brw_collect_and_report_shader_time(struct brw_context *brw)
+{
+ brw_collect_shader_time(brw);
+
+ if (brw->shader_time.report_time == 0 ||
+ get_time() - brw->shader_time.report_time >= 1.0) {
+ brw_report_shader_time(brw);
+ brw->shader_time.report_time = get_time();
+ }