nvc0: reduce the number of GPR used when reading MP perf counters
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
34
/* NOTE: intentionally using the same names as NV */
/* Human-readable names reported through the driver-specific query interface.
 * Entry order presumably mirrors the NVE4_HW_SM_QUERY_* enum used to index
 * nve4_hw_sm_queries[] below — TODO confirm against nvc0_query_hw_sm.h. */
static const char *nve4_hw_sm_query_names[] =
{
   /* MP counters */
   "active_cycles",
   "active_warps",
   "atom_count",
   "branch",
   "divergent_branch",
   "gld_request",
   "global_ld_mem_divergence_replays",
   "global_store_transaction",
   "global_st_mem_divergence_replays",
   "gred_count",
   "gst_request",
   "inst_executed",
   "inst_issued",
   "inst_issued1",
   "inst_issued2",
   "l1_global_load_hit",
   "l1_global_load_miss",
   "l1_local_load_hit",
   "l1_local_load_miss",
   "l1_local_store_hit",
   "l1_local_store_miss",
   "l1_shared_load_transactions",
   "l1_shared_store_transactions",
   "local_load",
   "local_load_transactions",
   "local_store",
   "local_store_transactions",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "shared_load",
   "shared_load_replay",
   "shared_store",
   "shared_store_replay",
   "sm_cta_launched",
   "threads_launched",
   "uncached_global_load_transaction",
   "warps_launched",
   /* metrics, i.e. functions of the MP counters */
   "metric-ipc",                   /* inst_executed, clock */
   "metric-ipac",                  /* inst_executed, active_cycles */
   "metric-ipec",                  /* inst_executed, (bool)inst_executed */
   "metric-achieved_occupancy",    /* active_warps, active_cycles */
   "metric-sm_efficiency",         /* active_cycles, clock */
   "metric-inst_replay_overhead"   /* inst_issued, inst_executed */
};
90
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
/* Precompiled GPU shader (NVE4/Kepler ISA, with explicit "sched" control
 * words).  The 64-bit opcode words below correspond one-to-one, in order,
 * to the disassembly listing in the comment.  Do NOT edit the words by hand;
 * regenerate them from the assembly if the kernel needs to change. */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
176
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Configuration for a single hardware MP performance counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};

/* How the individual counter slots of a query are combined into one value. */
#define NVC0_COUNTER_OPn_SUM            0
#define NVC0_COUNTER_OPn_OR             1
#define NVC0_COUNTER_OPn_AND            2
#define NVC0_COUNTER_OP2_REL_SUM_MM     3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0     4 /* sum(ctr0) / ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / ctr1 of MP[0]) */

/* A full query: up to 8 counter slots, a combining op, and a normalization
 * factor (num/denom) applied to the combined result. */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8];
   uint8_t num_counters;
   uint8_t op;
   uint8_t norm[2]; /* normalization num,denom */
};
208
/* Table-entry helpers for nve4_hw_sm_queries[]:
 *  _Q1A / _Q1B : single-counter query on signal domain A (per warp scheduler)
 *                or B; (f)unc, (m)ode, signal (g)roup, (s)ource select, and
 *                normalization (nu)m/(d)enom.
 *  _M2A/_M2B/_M2AB : two-counter metric; both counters on domain A, both on
 *                B, or counter0 on A and counter1 on B, combined with (o)p. */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
223
/* NOTES:
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *    this is inaccurate !
 */
/* NVE4+ query configurations, indexed by NVE4_HW_SM_QUERY_*.  The magic
 * func/src values select hardware signals; treat them as opaque. */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
   _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   /* metrics: two counters combined via NVC0_COUNTER_OP2_* */
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};
282
/* The table-entry helpers are local to nve4_hw_sm_queries[]; undefine all of
 * them (the original list forgot _M2AB, leaking it past this point). */
#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB
287
/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* NOTES:
 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
 *   because there is a context-switch problem that we need to fix.
 *   Results might be wrong sometimes, be careful!
 */
/* Query names for the Fermi (NVC0:NVE4) path; entry order presumably matches
 * the NVC0_HW_SM_QUERY_* enum — TODO confirm against nvc0_query_hw_sm.h. */
static const char *nvc0_hw_sm_query_names[] =
{
   /* MP counters */
   "active_cycles",
   "active_warps",
   "atom_count",
   "branch",
   "divergent_branch",
   "gld_request",
   "gred_count",
   "gst_request",
   "inst_executed",
   "inst_issued",
   "inst_issued1_0",
   "inst_issued1_1",
   "inst_issued2_0",
   "inst_issued2_1",
   "local_load",
   "local_store",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "shared_load",
   "shared_store",
   "threads_launched",
   "thread_inst_executed_0",
   "thread_inst_executed_1",
   "thread_inst_executed_2",
   "thread_inst_executed_3",
   "warps_launched",
};
330
/* Precompiled GPU shader (NVC0/Fermi ISA) that dumps the 8 MP performance
 * counters plus a sequence word to the buffer addressed by c0[0x0..0x8].
 * The 64-bit opcode words correspond one-to-one, in order, to the
 * disassembly in the comment; do not edit them by hand. */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
380
/* _C: one Fermi counter slot — (f)unc, (o)p mode, signal (g)roup, source
 * (m)ask and (s)elect.  _Q: table entry keyed by NVC0_HW_SM_QUERY_*. */
#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c

/* ==== Compute capability 2.0 (GF100/GF110) ==== */
static const struct nvc0_hw_sm_query_cfg
sm20_active_cycles =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

/* Summed over 6 slots of group 0x24; presumably one signal per warp
 * scheduler pair — see the mask_sel slot-offset fixup in
 * nvc0_hw_sm_begin_query(). */
static const struct nvc0_hw_sm_query_cfg
sm20_active_warps =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_atom_count =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_branch =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
   .num_counters = 2,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_divergent_branch =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
   .num_counters = 2,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gld_request =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gred_count =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gst_request =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};
463
static const struct nvc0_hw_sm_query_cfg
sm20_inst_executed =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
   .num_counters = 2,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_inst_issued =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
   .num_counters = 2,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_local_ld =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_local_st =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

/* prof_trigger_0..7: user-defined triggers, all in signal group 0x01,
 * selected by source offsets 0x00..0x70. */
static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_2 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_3 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_4 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_5 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_6 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_7 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};
573
static const struct nvc0_hw_sm_query_cfg
sm20_shared_ld =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_shared_st =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

/* Summed over 6 slots of group 0x26 (sources 0x10..0x60). */
static const struct nvc0_hw_sm_query_cfg
sm20_threads_launched =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_warps_launched =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};
642
/* Compute-capability 2.0 dispatch table, indexed by NVC0_HW_SM_QUERY_*.
 * NULL entries are queries that do not exist on this generation. */
static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES,       &sm20_active_cycles),
   _Q(ACTIVE_WARPS,        &sm20_active_warps),
   _Q(ATOM_COUNT,          &sm20_atom_count),
   _Q(BRANCH,              &sm20_branch),
   _Q(DIVERGENT_BRANCH,    &sm20_divergent_branch),
   _Q(GLD_REQUEST,         &sm20_gld_request),
   _Q(GRED_COUNT,          &sm20_gred_count),
   _Q(GST_REQUEST,         &sm20_gst_request),
   _Q(INST_EXECUTED,       &sm20_inst_executed),
   _Q(INST_ISSUED,         &sm20_inst_issued),
   _Q(INST_ISSUED1_0,      NULL),
   _Q(INST_ISSUED1_1,      NULL),
   _Q(INST_ISSUED2_0,      NULL),
   _Q(INST_ISSUED2_1,      NULL),
   _Q(LOCAL_LD,            &sm20_local_ld),
   _Q(LOCAL_ST,            &sm20_local_st),
   _Q(PROF_TRIGGER_0,      &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1,      &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2,      &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3,      &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4,      &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5,      &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6,      &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7,      &sm20_prof_trigger_7),
   _Q(SHARED_LD,           &sm20_shared_ld),
   _Q(SHARED_ST,           &sm20_shared_st),
   _Q(THREADS_LAUNCHED,    &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0,  &sm20_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1,  &sm20_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2,  NULL),
   _Q(TH_INST_EXECUTED_3,  NULL),
   _Q(WARPS_LAUNCHED,      &sm20_warps_launched),
};
678
/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
/* Only the configs that differ from the 2.0 tables are redefined here;
 * everything else is shared (see sm21_hw_sm_queries below). */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_executed =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
   .num_counters = 3,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

/* thread_inst_executed_0..3 live in groups 0xa3/0xa5/0xa4/0xa6 (note the
 * non-monotonic group order), six slots each. */
static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_2 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_3 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op = NVC0_COUNTER_OPn_SUM,
   .norm = { 1, 1 },
};
782
/* Compute-capability 2.1 dispatch table; shares most sm20_* configs and
 * overrides the entries that differ.  NULL entries are unsupported. */
static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES,       &sm20_active_cycles),
   _Q(ACTIVE_WARPS,        &sm20_active_warps),
   _Q(ATOM_COUNT,          &sm20_atom_count),
   _Q(BRANCH,              &sm20_branch),
   _Q(DIVERGENT_BRANCH,    &sm20_divergent_branch),
   _Q(GLD_REQUEST,         &sm20_gld_request),
   _Q(GRED_COUNT,          &sm20_gred_count),
   _Q(GST_REQUEST,         &sm20_gst_request),
   _Q(INST_EXECUTED,       &sm21_inst_executed),
   _Q(INST_ISSUED,         NULL),
   _Q(INST_ISSUED1_0,      &sm21_inst_issued1_0),
   _Q(INST_ISSUED1_1,      &sm21_inst_issued1_1),
   _Q(INST_ISSUED2_0,      &sm21_inst_issued2_0),
   _Q(INST_ISSUED2_1,      &sm21_inst_issued2_1),
   _Q(LOCAL_LD,            &sm20_local_ld),
   _Q(LOCAL_ST,            &sm20_local_st),
   _Q(PROF_TRIGGER_0,      &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1,      &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2,      &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3,      &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4,      &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5,      &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6,      &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7,      &sm20_prof_trigger_7),
   _Q(SHARED_LD,           &sm20_shared_ld),
   _Q(SHARED_ST,           &sm20_shared_st),
   _Q(THREADS_LAUNCHED,    &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0,  &sm21_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1,  &sm21_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2,  &sm21_th_inst_executed_2),
   _Q(TH_INST_EXECUTED_3,  &sm21_th_inst_executed_3),
   _Q(WARPS_LAUNCHED,      &sm20_warps_launched),
};
818
819 #undef _Q
820 #undef _C
821
822 static inline const struct nvc0_hw_sm_query_cfg **
823 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
824 {
825 struct nouveau_device *dev = screen->base.device;
826
827 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
828 return sm20_hw_sm_queries;
829 return sm21_hw_sm_queries;
830 }
831
832 static const struct nvc0_hw_sm_query_cfg *
833 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
834 {
835 struct nvc0_screen *screen = nvc0->screen;
836 struct nvc0_query *q = &hq->base;
837
838 if (screen->base.class_3d >= NVE4_3D_CLASS)
839 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
840
841 if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) {
842 const struct nvc0_hw_sm_query_cfg **queries =
843 nvc0_hw_sm_get_queries(screen);
844 return queries[q->type - NVC0_HW_SM_QUERY(0)];
845 }
846 debug_printf("invalid query type: %d\n", q->type);
847 return NULL;
848 }
849
850 static void
851 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
852 {
853 struct nvc0_query *q = &hq->base;
854 q->funcs->destroy_query(nvc0, q);
855 }
856
857 static boolean
858 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
859 {
860 struct nvc0_screen *screen = nvc0->screen;
861 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
862 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
863 const struct nvc0_hw_sm_query_cfg *cfg;
864 unsigned i, c;
865 unsigned num_ab[2] = { 0, 0 };
866
867 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
868
869 /* check if we have enough free counter slots */
870 for (i = 0; i < cfg->num_counters; ++i)
871 num_ab[cfg->ctr[i].sig_dom]++;
872
873 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
874 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
875 NOUVEAU_ERR("Not enough free MP counter slots !\n");
876 return false;
877 }
878
879 assert(cfg->num_counters <= 4);
880 PUSH_SPACE(push, 4 * 8 * + 6);
881
882 if (!screen->pm.mp_counters_enabled) {
883 screen->pm.mp_counters_enabled = true;
884 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
885 PUSH_DATA (push, 0x1fcb);
886 }
887
888 /* set sequence field to 0 (used to check if result is available) */
889 for (i = 0; i < screen->mp_count; ++i)
890 hq->data[i * 10 + 10] = 0;
891 hq->sequence++;
892
893 for (i = 0; i < cfg->num_counters; ++i) {
894 const unsigned d = cfg->ctr[i].sig_dom;
895
896 if (!screen->pm.num_hw_sm_active[d]) {
897 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
898 if (screen->pm.num_hw_sm_active[!d])
899 m |= 1 << (7 + (8 * d));
900 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
901 PUSH_DATA (push, m);
902 }
903 screen->pm.num_hw_sm_active[d]++;
904
905 for (c = d * 4; c < (d * 4 + 4); ++c) {
906 if (!screen->pm.mp_counter[c]) {
907 hsq->ctr[i] = c;
908 screen->pm.mp_counter[c] = hsq;
909 break;
910 }
911 }
912 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
913
914 /* configure and reset the counter(s) */
915 if (d == 0)
916 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
917 else
918 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
919 PUSH_DATA (push, cfg->ctr[i].sig_sel);
920 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
921 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
922 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
923 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
924 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
925 PUSH_DATA (push, 0);
926 }
927 return true;
928 }
929
930 static boolean
931 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
932 {
933 struct nvc0_screen *screen = nvc0->screen;
934 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
935 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
936 const struct nvc0_hw_sm_query_cfg *cfg;
937 unsigned i, c;
938
939 if (screen->base.class_3d >= NVE4_3D_CLASS)
940 return nve4_hw_sm_begin_query(nvc0, hq);
941
942 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
943
944 /* check if we have enough free counter slots */
945 if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
946 NOUVEAU_ERR("Not enough free MP counter slots !\n");
947 return false;
948 }
949
950 assert(cfg->num_counters <= 8);
951 PUSH_SPACE(push, 8 * 8 + 2);
952
953 /* set sequence field to 0 (used to check if result is available) */
954 for (i = 0; i < screen->mp_count; ++i) {
955 const unsigned b = (0x30 / 4) * i;
956 hq->data[b + 8] = 0;
957 }
958 hq->sequence++;
959
960 for (i = 0; i < cfg->num_counters; ++i) {
961 uint32_t mask_sel = 0x00000000;
962
963 if (!screen->pm.num_hw_sm_active[0]) {
964 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
965 PUSH_DATA (push, 0x80000000);
966 }
967 screen->pm.num_hw_sm_active[0]++;
968
969 for (c = 0; c < 8; ++c) {
970 if (!screen->pm.mp_counter[c]) {
971 hsq->ctr[i] = c;
972 screen->pm.mp_counter[c] = hsq;
973 break;
974 }
975 }
976
977 /* Oddly-enough, the signal id depends on the slot selected on Fermi but
978 * not on Kepler. Fortunately, the signal ids are just offseted by the
979 * slot id! */
980 mask_sel |= c;
981 mask_sel |= (c << 8);
982 mask_sel |= (c << 16);
983 mask_sel |= (c << 24);
984 mask_sel &= cfg->ctr[i].src_mask;
985
986 /* configure and reset the counter(s) */
987 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
988 PUSH_DATA (push, cfg->ctr[i].sig_sel);
989 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
990 PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
991 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
992 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
993 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
994 PUSH_DATA (push, 0);
995 }
996 return true;
997 }
998
/* Stop a HW SM query: freeze all active MP counters, read them back into the
 * query's buffer with a small compute shader, release the slots owned by this
 * query, and finally re-arm the counters that still belong to other queries.
 */
static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3]; /* kernel parameters: 64-bit dest address + sequence */
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   unsigned c;

   /* lazily create the counter-readback compute program on first use */
   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true; /* pre-assembled binary, skip the compiler */
      prog->parm_size = 12;
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
         prog->num_gprs = 14;
      } else {
         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
         prog->num_gprs = 12;
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   /* make the query BO writable by the readback kernel */
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   /* ensure counter writes have landed before the kernel reads them */
   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (hq->bo->offset + hq->base_offset);
   input[1] = (hq->bo->offset + hq->base_offset) >> 32;
   input[2] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0; /* slots already re-programmed, so each is touched only once */
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}
1088
1089 static inline bool
1090 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
1091 struct nvc0_context *nvc0, bool wait,
1092 struct nvc0_hw_query *hq,
1093 const struct nvc0_hw_sm_query_cfg *cfg,
1094 unsigned mp_count)
1095 {
1096 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1097 unsigned p, c;
1098
1099 for (p = 0; p < mp_count; ++p) {
1100 const unsigned b = (0x30 / 4) * p;
1101
1102 for (c = 0; c < cfg->num_counters; ++c) {
1103 if (hq->data[b + 8] != hq->sequence) {
1104 if (!wait)
1105 return false;
1106 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
1107 return false;
1108 }
1109 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
1110 }
1111 }
1112 return true;
1113 }
1114
/* Read back the raw per-MP counter values of a query (Kepler layout).
 *
 * The per-MP record is 0x60 bytes (see nvc0_hw_sm_create_query): counters
 * 0-3 exist once per warp scheduler (4 copies that must be summed), while
 * counters 4-7 are per-MP (a single value at words 16-19). Words 20-23 hold
 * one sequence value per warp scheduler, used to detect whether the readback
 * kernel has finished; when 'wait' is false and the data is not ready yet,
 * this returns false.
 */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p; /* 0x60 bytes of data per MP */

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* per-MP counters (slot >= 4) have one copy, others four */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}
1146
1147 /* Metric calculations:
1148 * sum(x) ... sum of x over all MPs
1149 * avg(x) ... average of x over all MPs
1150 *
1151 * IPC : sum(inst_executed) / clock
1152 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
1153 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
1154 * MP_EFFICIENCY : avg(active_cycles / clock)
1155 *
1156 * NOTE: Interpretation of IPC requires knowledge of MP count.
1157 */
1158 static boolean
1159 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
1160 boolean wait, union pipe_query_result *result)
1161 {
1162 uint32_t count[32][8];
1163 uint64_t value = 0;
1164 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
1165 unsigned p, c;
1166 const struct nvc0_hw_sm_query_cfg *cfg;
1167 bool ret;
1168
1169 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1170
1171 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
1172 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1173 else
1174 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1175 if (!ret)
1176 return false;
1177
1178 if (cfg->op == NVC0_COUNTER_OPn_SUM) {
1179 for (c = 0; c < cfg->num_counters; ++c)
1180 for (p = 0; p < mp_count; ++p)
1181 value += count[p][c];
1182 value = (value * cfg->norm[0]) / cfg->norm[1];
1183 } else
1184 if (cfg->op == NVC0_COUNTER_OPn_OR) {
1185 uint32_t v = 0;
1186 for (c = 0; c < cfg->num_counters; ++c)
1187 for (p = 0; p < mp_count; ++p)
1188 v |= count[p][c];
1189 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
1190 } else
1191 if (cfg->op == NVC0_COUNTER_OPn_AND) {
1192 uint32_t v = ~0;
1193 for (c = 0; c < cfg->num_counters; ++c)
1194 for (p = 0; p < mp_count; ++p)
1195 v &= count[p][c];
1196 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
1197 } else
1198 if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
1199 uint64_t v[2] = { 0, 0 };
1200 for (p = 0; p < mp_count; ++p) {
1201 v[0] += count[p][0];
1202 v[1] += count[p][1];
1203 }
1204 if (v[0])
1205 value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
1206 } else
1207 if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
1208 for (p = 0; p < mp_count; ++p)
1209 value += count[p][0];
1210 if (count[0][1])
1211 value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
1212 else
1213 value = 0;
1214 } else
1215 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
1216 unsigned mp_used = 0;
1217 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
1218 if (count[p][1])
1219 value += (count[p][0] * cfg->norm[0]) / count[p][1];
1220 if (mp_used)
1221 value /= (uint64_t)mp_used * cfg->norm[1];
1222 } else
1223 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
1224 unsigned mp_used = 0;
1225 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
1226 value += count[p][0];
1227 if (count[0][1] && mp_used) {
1228 value *= cfg->norm[0];
1229 value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
1230 } else {
1231 value = 0;
1232 }
1233 }
1234
1235 *(uint64_t *)result = value;
1236 return true;
1237 }
1238
/* Hooks the HW SM (MP performance counter) queries into the generic nvc0
 * HW query machinery. */
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};
1245
/* Create a HW SM query object for the given query type.
 *
 * Returns NULL if the kernel is too old to expose the MP counters, if the
 * type is not a HW SM query, or on allocation failure. The buffer layout
 * allocated here is what the readback kernels and the *_read_data() helpers
 * above expect.
 */
struct nvc0_hw_query *
nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_hw_sm_query *hsq;
   struct nvc0_hw_query *hq;
   unsigned space;

   /* MP counters require a recent enough kernel interface */
   if (nvc0->screen->base.device->drm_version < 0x01000101)
      return NULL;

   if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
       (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
      return NULL;

   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
   if (!hsq)
      return NULL;

   hq = &hsq->base;
   hq->funcs = &hw_sm_query_funcs;
   hq->base.type = type;

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      /* for each MP:
       * [00] = WS0.C0
       * [04] = WS0.C1
       * [08] = WS0.C2
       * [0c] = WS0.C3
       * [10] = WS1.C0
       * [14] = WS1.C1
       * [18] = WS1.C2
       * [1c] = WS1.C3
       * [20] = WS2.C0
       * [24] = WS2.C1
       * [28] = WS2.C2
       * [2c] = WS2.C3
       * [30] = WS3.C0
       * [34] = WS3.C1
       * [38] = WS3.C2
       * [3c] = WS3.C3
       * [40] = MP.C4
       * [44] = MP.C5
       * [48] = MP.C6
       * [4c] = MP.C7
       * [50] = WS0.sequence
       * [54] = WS1.sequence
       * [58] = WS2.sequence
       * [5c] = WS3.sequence
       */
      space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
   } else {
      /*
       * Note that padding is used to align memory access to 128 bits.
       *
       * for each MP:
       * [00] = MP.C0
       * [04] = MP.C1
       * [08] = MP.C2
       * [0c] = MP.C3
       * [10] = MP.C4
       * [14] = MP.C5
       * [18] = MP.C6
       * [1c] = MP.C7
       * [20] = MP.sequence
       * [24] = padding
       * [28] = padding
       * [2c] = padding
       */
      space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
   }

   if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
      FREE(hq);
      return NULL;
   }

   return hq;
}
1320
1321 static int
1322 nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries,
1323 unsigned id)
1324 {
1325 unsigned i, next = 0;
1326
1327 for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
1328 if (!queries[i]) {
1329 next++;
1330 } else
1331 if (i >= id && queries[id + next]) {
1332 break;
1333 }
1334 }
1335 return id + next;
1336 }
1337
1338 int
1339 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
1340 struct pipe_driver_query_info *info)
1341 {
1342 int count = 0;
1343
1344 if (screen->base.device->drm_version >= 0x01000101) {
1345 if (screen->compute) {
1346 if (screen->base.class_3d == NVE4_3D_CLASS) {
1347 count += NVE4_HW_SM_QUERY_COUNT;
1348 } else
1349 if (screen->base.class_3d < NVE4_3D_CLASS) {
1350 const struct nvc0_hw_sm_query_cfg **queries =
1351 nvc0_hw_sm_get_queries(screen);
1352 unsigned i;
1353
1354 for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
1355 if (queries[i])
1356 count++;
1357 }
1358 }
1359 }
1360 }
1361
1362 if (!info)
1363 return count;
1364
1365 if (id < count) {
1366 if (screen->compute) {
1367 if (screen->base.class_3d == NVE4_3D_CLASS) {
1368 info->name = nve4_hw_sm_query_names[id];
1369 info->query_type = NVE4_HW_SM_QUERY(id);
1370 info->max_value.u64 =
1371 (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
1372 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1373 return 1;
1374 } else
1375 if (screen->base.class_3d < NVE4_3D_CLASS) {
1376 const struct nvc0_hw_sm_query_cfg **queries =
1377 nvc0_hw_sm_get_queries(screen);
1378
1379 id = nvc0_hw_sm_get_next_query_id(queries, id);
1380 info->name = nvc0_hw_sm_query_names[id];
1381 info->query_type = NVC0_HW_SM_QUERY(id);
1382 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1383 return 1;
1384 }
1385 }
1386 }
1387 return 0;
1388 }