3f5a87676ade137456a154ff99d793f40de3753c
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */

/* Human-readable names for the NVE4+ (Kepler) MP performance counters as
 * exposed to the state tracker.
 * NOTE: intentionally using the same names as NV
 * (double-underscore prefixed entries are internal/undocumented signals).
 * Presumably indexed by the NVE4_HW_SM_QUERY_* enum so it parallels
 * nve4_hw_sm_queries[] below — confirm against nvc0_query_hw_sm.h. */
static const char *nve4_hw_sm_query_names[] =
{
   /* MP counters */
   "active_cycles",
   "active_warps",
   "atom_cas_count",
   "atom_count",
   "branch",
   "divergent_branch",
   "gld_request",
   "global_ld_mem_divergence_replays",
   "global_store_transaction",
   "global_st_mem_divergence_replays",
   "gred_count",
   "gst_request",
   "inst_executed",
   "inst_issued1",
   "inst_issued2",
   "l1_global_load_hit",
   "l1_global_load_miss",
   "__l1_global_load_transactions",
   "__l1_global_store_transactions",
   "l1_local_load_hit",
   "l1_local_load_miss",
   "l1_local_store_hit",
   "l1_local_store_miss",
   "l1_shared_load_transactions",
   "l1_shared_store_transactions",
   "local_load",
   "local_load_transactions",
   "local_store",
   "local_store_transactions",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "shared_load",
   "shared_load_replay",
   "shared_store",
   "shared_store_replay",
   "sm_cta_launched",
   "threads_launched",
   "uncached_global_load_transaction",
   "warps_launched",
};
85
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 *
 * Pre-assembled NVE4 (Kepler) compute shader, launched from
 * nvc0_hw_sm_end_query(): each entry holds one 64-bit instruction word
 * (the "sched" words interleave scheduling info with the instructions).
 * Thread 0 of each warp reads $pm0-$pm7 and stores them, together with a
 * sequence number taken from c0[0x8], to the query buffer whose 64-bit
 * address is passed in c0[0x0]/c0[0x4]. The MP/GPC id is extracted from
 * $physid to compute each SM's slot in the buffer. Keep the disassembly
 * below in sync if these words are ever regenerated. */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
171
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Hardware configuration for a single MP performance counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
187
/* Complete configuration for one SM query: up to 8 hardware counter slots
 * plus a normalization fraction applied to the summed result. */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8]; /* only the first num_counters used */
   uint8_t num_counters;
   uint8_t norm[2]; /* normalization num,denom */
};
194
195 #define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } }
196 #define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } }
197
198 /* NOTES:
199 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
200 * inst_executed etc.: we only count a single warp scheduler
201 */
202 static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
203 {
204 _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
205 _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
206 _Q1A(ATOM_CAS_COUNT, 0x0001, B6, BRANCH, 0x000000004, 1, 1),
207 _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
208 _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
209 _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
210 _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
211 _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
212 _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
213 _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
214 _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
215 _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
216 _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
217 _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
218 _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
219 _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
220 _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
221 _Q1B(L1_GLD_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000000, 1, 1),
222 _Q1B(L1_GST_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000004, 1, 1),
223 _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
224 _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
225 _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
226 _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
227 _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
228 _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
229 _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
230 _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
231 _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
232 _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
233 _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
234 _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
235 _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
236 _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
237 _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
238 _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
239 _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
240 _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
241 _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
242 _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
243 _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
244 _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
245 _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
246 _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
247 _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
248 _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
249 };
250
251 #undef _Q1A
252 #undef _Q1B
253
/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* NOTES:
 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
 *   because there is a context-switch problem that we need to fix.
 *   Results might be wrong sometimes, be careful!
 */
/* Human-readable names for the Fermi MP performance counters exposed to the
 * state tracker. Presumably indexed by NVC0_HW_SM_QUERY_* so it parallels
 * the sm20/sm21 tables below — confirm against nvc0_query_hw_sm.h. */
static const char *nvc0_hw_sm_query_names[] =
{
   /* MP counters */
   "active_cycles",
   "active_warps",
   "atom_count",
   "branch",
   "divergent_branch",
   "gld_request",
   "gred_count",
   "gst_request",
   "inst_executed",
   "inst_issued",
   "inst_issued1_0",
   "inst_issued1_1",
   "inst_issued2_0",
   "inst_issued2_1",
   "local_load",
   "local_store",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "shared_load",
   "shared_store",
   "threads_launched",
   "thread_inst_executed_0",
   "thread_inst_executed_1",
   "thread_inst_executed_2",
   "thread_inst_executed_3",
   "warps_launched",
};
296
/* Pre-assembled NVC0 (Fermi) counterpart of nve4_read_hw_sm_counters_code:
 * thread 0 reads $pm0-$pm7, derives the SM's buffer slot from $physid
 * (48 bytes per SM), and stores the 8 counters plus a sequence number
 * (c0[0x8]) at the 64-bit address passed in c0[0x0]/c0[0x4]. Keep the
 * disassembly in sync if these words are regenerated. */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
346
/* Counter-slot builder for the Fermi tables below:
 *   f = function mask, o = op mode (NVC0_COMPUTE_MP_PM_OP_MODE_##o),
 *   sig_dom is always 0 (single domain on NVC0:NVE4),
 *   g = signal group, m = source mask, s = source selection. */
#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
/* Map a query enum value to its (possibly NULL) config pointer. */
#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c

/* ==== Compute capability 2.0 (GF100/GF110) ==== */
static const struct nvc0_hw_sm_query_cfg
sm20_active_cycles =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* Sums one signal per warp scheduler slot (6 slots). */
static const struct nvc0_hw_sm_query_cfg
sm20_active_warps =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_atom_count =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_branch =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
   .num_counters = 2,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_divergent_branch =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
   .num_counters = 2,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gld_request =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gred_count =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gst_request =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_inst_executed =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
   .num_counters = 2,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_inst_issued =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
   .num_counters = 2,
   .norm = { 1, 1 },
};
439
/* Remaining SM 2.0 single-group configs (LDST, triggers, launch stats). */
static const struct nvc0_hw_sm_query_cfg
sm20_local_ld =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_local_st =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* prof_trigger_N: user trigger N is signal N within group 0x01. */
static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_2 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_3 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_4 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_5 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_6 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_7 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_shared_ld =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_shared_st =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* Summed over 6 slots, like sm20_active_warps. */
static const struct nvc0_hw_sm_query_cfg
sm20_threads_launched =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_warps_launched =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};
582
/* SM 2.0 query dispatch table, indexed by NVC0_HW_SM_QUERY_*.
 * NULL entries are queries that only exist on SM 2.1 hardware. */
static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES, &sm20_active_cycles),
   _Q(ACTIVE_WARPS,  &sm20_active_warps),
   _Q(ATOM_COUNT, &sm20_atom_count),
   _Q(BRANCH, &sm20_branch),
   _Q(DIVERGENT_BRANCH, &sm20_divergent_branch),
   _Q(GLD_REQUEST, &sm20_gld_request),
   _Q(GRED_COUNT, &sm20_gred_count),
   _Q(GST_REQUEST, &sm20_gst_request),
   _Q(INST_EXECUTED, &sm20_inst_executed),
   _Q(INST_ISSUED, &sm20_inst_issued),
   _Q(INST_ISSUED1_0, NULL),
   _Q(INST_ISSUED1_1, NULL),
   _Q(INST_ISSUED2_0, NULL),
   _Q(INST_ISSUED2_1, NULL),
   _Q(LOCAL_LD, &sm20_local_ld),
   _Q(LOCAL_ST, &sm20_local_st),
   _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7),
   _Q(SHARED_LD, &sm20_shared_ld),
   _Q(SHARED_ST, &sm20_shared_st),
   _Q(THREADS_LAUNCHED, &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2, NULL),
   _Q(TH_INST_EXECUTED_3, NULL),
   _Q(WARPS_LAUNCHED, &sm20_warps_launched),
};
618
/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
/* SM 2.1 configs that differ from SM 2.0; everything else is shared with
 * the sm20_* configs above (see sm21_hw_sm_queries). */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_executed =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
   .num_counters = 3,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* thread_inst_executed quadrants use per-quadrant signal groups
 * (0xa3/0xa5/0xa4/0xa6), each summed over 6 slots. */
static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_0 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_1 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_2 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_3 =
{
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};
713
/* SM 2.1 query dispatch table, indexed by NVC0_HW_SM_QUERY_*.
 * Shares most entries with sm20_*; NULL marks SM 2.0-only queries. */
static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES, &sm20_active_cycles),
   _Q(ACTIVE_WARPS,  &sm20_active_warps),
   _Q(ATOM_COUNT, &sm20_atom_count),
   _Q(BRANCH, &sm20_branch),
   _Q(DIVERGENT_BRANCH, &sm20_divergent_branch),
   _Q(GLD_REQUEST, &sm20_gld_request),
   _Q(GRED_COUNT, &sm20_gred_count),
   _Q(GST_REQUEST, &sm20_gst_request),
   _Q(INST_EXECUTED, &sm21_inst_executed),
   _Q(INST_ISSUED, NULL),
   _Q(INST_ISSUED1_0, &sm21_inst_issued1_0),
   _Q(INST_ISSUED1_1, &sm21_inst_issued1_1),
   _Q(INST_ISSUED2_0, &sm21_inst_issued2_0),
   _Q(INST_ISSUED2_1, &sm21_inst_issued2_1),
   _Q(LOCAL_LD, &sm20_local_ld),
   _Q(LOCAL_ST, &sm20_local_st),
   _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7),
   _Q(SHARED_LD, &sm20_shared_ld),
   _Q(SHARED_ST, &sm20_shared_st),
   _Q(THREADS_LAUNCHED, &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2),
   _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3),
   _Q(WARPS_LAUNCHED, &sm20_warps_launched),
};

#undef _Q
#undef _C
752
753 static inline const struct nvc0_hw_sm_query_cfg **
754 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
755 {
756 struct nouveau_device *dev = screen->base.device;
757
758 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
759 return sm20_hw_sm_queries;
760 return sm21_hw_sm_queries;
761 }
762
763 static const struct nvc0_hw_sm_query_cfg *
764 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
765 {
766 struct nvc0_screen *screen = nvc0->screen;
767 struct nvc0_query *q = &hq->base;
768
769 if (screen->base.class_3d >= NVE4_3D_CLASS)
770 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
771
772 if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) {
773 const struct nvc0_hw_sm_query_cfg **queries =
774 nvc0_hw_sm_get_queries(screen);
775 return queries[q->type - NVC0_HW_SM_QUERY(0)];
776 }
777 debug_printf("invalid query type: %d\n", q->type);
778 return NULL;
779 }
780
/* Destroy an SM query: release its data buffer (allocate size 0), drop the
 * fence reference, then free the query object itself. */
static void
nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_query *q = &hq->base;
   nvc0_hw_query_allocate(nvc0, q, 0); /* frees hq->bo / data storage */
   nouveau_fence_ref(NULL, &hq->fence);
   FREE(hq);
}
789
790 static boolean
791 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
792 {
793 struct nvc0_screen *screen = nvc0->screen;
794 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
795 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
796 const struct nvc0_hw_sm_query_cfg *cfg;
797 unsigned i, c;
798 unsigned num_ab[2] = { 0, 0 };
799
800 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
801
802 /* check if we have enough free counter slots */
803 for (i = 0; i < cfg->num_counters; ++i)
804 num_ab[cfg->ctr[i].sig_dom]++;
805
806 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
807 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
808 NOUVEAU_ERR("Not enough free MP counter slots !\n");
809 return false;
810 }
811
812 assert(cfg->num_counters <= 4);
813 PUSH_SPACE(push, 4 * 8 * + 6);
814
815 if (!screen->pm.mp_counters_enabled) {
816 screen->pm.mp_counters_enabled = true;
817 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
818 PUSH_DATA (push, 0x1fcb);
819 }
820
821 /* set sequence field to 0 (used to check if result is available) */
822 for (i = 0; i < screen->mp_count; ++i)
823 hq->data[i * 10 + 10] = 0;
824 hq->sequence++;
825
826 for (i = 0; i < cfg->num_counters; ++i) {
827 const unsigned d = cfg->ctr[i].sig_dom;
828
829 if (!screen->pm.num_hw_sm_active[d]) {
830 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
831 if (screen->pm.num_hw_sm_active[!d])
832 m |= 1 << (7 + (8 * d));
833 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
834 PUSH_DATA (push, m);
835 }
836 screen->pm.num_hw_sm_active[d]++;
837
838 for (c = d * 4; c < (d * 4 + 4); ++c) {
839 if (!screen->pm.mp_counter[c]) {
840 hsq->ctr[i] = c;
841 screen->pm.mp_counter[c] = hsq;
842 break;
843 }
844 }
845 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
846
847 /* configure and reset the counter(s) */
848 if (d == 0)
849 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
850 else
851 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
852 PUSH_DATA (push, cfg->ctr[i].sig_sel);
853 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
854 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
855 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
856 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
857 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
858 PUSH_DATA (push, 0);
859 }
860 return true;
861 }
862
/* Start an SM query. Dispatches to the NVE4+ path on Kepler; the body below
 * handles Fermi (NVC0:NVE4), which has a single domain of 8 counter slots.
 * Returns false if not enough free slots remain. */
static boolean
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return nve4_hw_sm_begin_query(nvc0, hq);

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return false;
   }

   assert(cfg->num_counters <= 8);
   /* worst case: 8 counters x 4 method pairs + one SW(0x0600) pair */
   PUSH_SPACE(push, 8 * 8 + 2);

   /* set sequence field to 0 (used to check if result is available) */
   for (i = 0; i < screen->mp_count; ++i) {
      const unsigned b = (0x30 / 4) * i; /* 48 bytes (12 words) per MP */
      hq->data[b + 8] = 0; /* word 8 = sequence, see nvc0 read-back shader */
   }
   hq->sequence++;

   for (i = 0; i < cfg->num_counters; ++i) {
      uint32_t mask_sel = 0x00000000;

      /* first active counter overall: enable MP counting via SW method */
      if (!screen->pm.num_hw_sm_active[0]) {
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, 0x80000000);
      }
      screen->pm.num_hw_sm_active[0]++;

      /* claim the first free slot; guaranteed by the check above */
      for (c = 0; c < 8; ++c) {
         if (!screen->pm.mp_counter[c]) {
            hsq->ctr[i] = c;
            screen->pm.mp_counter[c] = hsq;
            break;
         }
      }

      /* Oddly-enough, the signal id depends on the slot selected on Fermi but
       * not on Kepler. Fortunately, the signal ids are just offseted by the
       * slot id! */
      mask_sel |= c;
      mask_sel |= (c << 8);
      mask_sel |= (c << 16);
      mask_sel |= (c << 24);
      mask_sel &= cfg->ctr[i].src_mask;

      /* configure and reset the counter(s) */
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
   return true;
}
931
/* Stop an SM query: freeze all counters, release this query's slots, launch
 * the read-back compute shader to dump counter values + sequence number into
 * the query buffer, then re-arm the counters still owned by other queries. */
static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3];
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   unsigned c;

   /* Lazily build the read-back compute program from the pre-assembled
    * shader code blobs above; it is shared by all SM queries. */
   if (unlikely(!screen->pm.prog)) {
      /* NOTE(review): CALLOC_STRUCT result is dereferenced unchecked —
       * an OOM here would crash; confirm whether callers guarantee this
       * cannot fail or a check should be added. */
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->parm_size = 12; /* 3 x u32: buffer address lo/hi + sequence */
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
         prog->num_gprs = 14;
      } else {
         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
         prog->num_gprs = 12;
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   /* launch the read-back shader; the shader reads these as c0[0x0..0x8] */
   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (hq->bo->offset + hq->base_offset);       /* low 32 bits */
   input[1] = (hq->bo->offset + hq->base_offset) >> 32; /* high 32 bits */
   input[2] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0; /* tracks slots already re-armed (a query may own several) */
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}
1021
1022 static inline bool
1023 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
1024 struct nvc0_context *nvc0, bool wait,
1025 struct nvc0_hw_query *hq,
1026 const struct nvc0_hw_sm_query_cfg *cfg,
1027 unsigned mp_count)
1028 {
1029 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1030 unsigned p, c;
1031
1032 for (p = 0; p < mp_count; ++p) {
1033 const unsigned b = (0x30 / 4) * p;
1034
1035 for (c = 0; c < cfg->num_counters; ++c) {
1036 if (hq->data[b + 8] != hq->sequence) {
1037 if (!wait)
1038 return false;
1039 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
1040 return false;
1041 }
1042 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
1043 }
1044 }
1045 return true;
1046 }
1047
/* Read back the per-MP counter values for an NVE4-generation query.
 *
 * The query buffer holds 0x60 bytes per MP (see the layout comment in
 * nvc0_hw_sm_create_query): counters 0-3 are replicated per warp scheduler
 * (WS0-WS3, words 0-15), counters 4-7 exist once per MP (words 16-19), and
 * the four WS sequence words follow at words 20-23.  Returns false if the
 * results are not ready (when !wait) or if waiting on the buffer failed.
 */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p; /* 0x60 bytes of data per MP */

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* counters 0-3 have 4 per-scheduler copies to sum; 4-7 have one */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            /* data[b + 20 + d] is WSd.sequence for this MP */
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               /* MP.C4-C7: single value at words 16-19 */
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               /* WS-replicated counter: accumulate the 4 scheduler copies */
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}
1079
1080 static boolean
1081 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
1082 boolean wait, union pipe_query_result *result)
1083 {
1084 uint32_t count[32][8];
1085 uint64_t value = 0;
1086 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
1087 unsigned p, c;
1088 const struct nvc0_hw_sm_query_cfg *cfg;
1089 bool ret;
1090
1091 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1092
1093 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
1094 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1095 else
1096 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1097 if (!ret)
1098 return false;
1099
1100 for (c = 0; c < cfg->num_counters; ++c)
1101 for (p = 0; p < mp_count; ++p)
1102 value += count[p][c];
1103 value = (value * cfg->norm[0]) / cfg->norm[1];
1104
1105 *(uint64_t *)result = value;
1106 return true;
1107 }
1108
/* Dispatch table hooking the HW SM query implementation into the generic
 * nvc0 HW query machinery. */
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};
1115
/* Create a HW SM (MP performance counter) query of the given type.
 *
 * Returns NULL if the kernel is too old to expose the perfmon interface,
 * if the type is not a HW SM query, or on allocation failure.  The query
 * buffer is sized for the chipset's per-MP read-back layout (documented
 * below).
 */
struct nvc0_hw_query *
nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_hw_sm_query *hsq;
   struct nvc0_hw_query *hq;
   unsigned space;

   /* MP counters need perfmon support in the nouveau kernel module */
   if (nvc0->screen->base.device->drm_version < 0x01000101)
      return NULL;

   /* accept only types within the NVE4 or NVC0 HW SM query ranges */
   if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
       (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
      return NULL;

   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
   if (!hsq)
      return NULL;

   hq = &hsq->base;
   hq->funcs = &hw_sm_query_funcs;
   hq->base.type = type;

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      /* for each MP:
       * [00] = WS0.C0
       * [04] = WS0.C1
       * [08] = WS0.C2
       * [0c] = WS0.C3
       * [10] = WS1.C0
       * [14] = WS1.C1
       * [18] = WS1.C2
       * [1c] = WS1.C3
       * [20] = WS2.C0
       * [24] = WS2.C1
       * [28] = WS2.C2
       * [2c] = WS2.C3
       * [30] = WS3.C0
       * [34] = WS3.C1
       * [38] = WS3.C2
       * [3c] = WS3.C3
       * [40] = MP.C4
       * [44] = MP.C5
       * [48] = MP.C6
       * [4c] = MP.C7
       * [50] = WS0.sequence
       * [54] = WS1.sequence
       * [58] = WS2.sequence
       * [5c] = WS3.sequence
       */
      space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
   } else {
      /*
       * Note that padding is used to align memory access to 128 bits.
       *
       * for each MP:
       * [00] = MP.C0
       * [04] = MP.C1
       * [08] = MP.C2
       * [0c] = MP.C3
       * [10] = MP.C4
       * [14] = MP.C5
       * [18] = MP.C6
       * [1c] = MP.C7
       * [20] = MP.sequence
       * [24] = padding
       * [28] = padding
       * [2c] = padding
       */
      space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
   }

   if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
      /* hq points at hsq's first member, so this frees the whole struct */
      FREE(hq);
      return NULL;
   }

   return hq;
}
1195
/* Map a dense, externally-exposed query index to its slot in the sparse
 * per-chipset config table, skipping NULL entries (queries unsupported on
 * this chipset).  Returns the adjusted table index: @id plus the number of
 * holes that precede the selected entry. */
static int
nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries,
                             unsigned id)
{
   unsigned i, next = 0;

   for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
      if (!queries[i]) {
         /* hole in the table: shift the target index past it */
         next++;
      } else
      if (i >= id && queries[id + next]) {
         break;
      }
   }
   return id + next;
}
1212
1213 int
1214 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
1215 struct pipe_driver_query_info *info)
1216 {
1217 int count = 0;
1218
1219 if (screen->base.device->drm_version >= 0x01000101) {
1220 if (screen->compute) {
1221 if (screen->base.class_3d == NVE4_3D_CLASS) {
1222 count += NVE4_HW_SM_QUERY_COUNT;
1223 } else
1224 if (screen->base.class_3d < NVE4_3D_CLASS) {
1225 const struct nvc0_hw_sm_query_cfg **queries =
1226 nvc0_hw_sm_get_queries(screen);
1227 unsigned i;
1228
1229 for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
1230 if (queries[i])
1231 count++;
1232 }
1233 }
1234 }
1235 }
1236
1237 if (!info)
1238 return count;
1239
1240 if (id < count) {
1241 if (screen->compute) {
1242 if (screen->base.class_3d == NVE4_3D_CLASS) {
1243 info->name = nve4_hw_sm_query_names[id];
1244 info->query_type = NVE4_HW_SM_QUERY(id);
1245 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1246 return 1;
1247 } else
1248 if (screen->base.class_3d < NVE4_3D_CLASS) {
1249 const struct nvc0_hw_sm_query_cfg **queries =
1250 nvc0_hw_sm_get_queries(screen);
1251
1252 id = nvc0_hw_sm_get_next_query_id(queries, id);
1253 info->name = nvc0_hw_sm_query_names[id];
1254 info->query_type = NVC0_HW_SM_QUERY(id);
1255 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1256 return 1;
1257 }
1258 }
1259 }
1260 return 0;
1261 }