Merge remote-tracking branch 'public/master' into vulkan
[mesa.git] / src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* NOTE: intentionally using the same names as NV */
34 #define _Q(t, n) { NVC0_HW_SM_QUERY_##t, n }
35 struct {
36 unsigned type;
37 const char *name;
38 } nvc0_hw_sm_queries[] = {
39 _Q(ACTIVE_CYCLES, "active_cycles" ),
40 _Q(ACTIVE_WARPS, "active_warps" ),
41 _Q(ATOM_CAS_COUNT, "atom_cas_count" ),
42 _Q(ATOM_COUNT, "atom_count" ),
43 _Q(BRANCH, "branch" ),
44 _Q(DIVERGENT_BRANCH, "divergent_branch" ),
45 _Q(GLD_REQUEST, "gld_request" ),
46 _Q(GLD_MEM_DIV_REPLAY, "global_ld_mem_divergence_replays" ),
47 _Q(GST_TRANSACTIONS, "global_store_transaction" ),
48 _Q(GST_MEM_DIV_REPLAY, "global_st_mem_divergence_replays" ),
49 _Q(GRED_COUNT, "gred_count" ),
50 _Q(GST_REQUEST, "gst_request" ),
51 _Q(INST_EXECUTED, "inst_executed" ),
52 _Q(INST_ISSUED, "inst_issued" ),
53 _Q(INST_ISSUED1, "inst_issued1" ),
54 _Q(INST_ISSUED2, "inst_issued2" ),
55 _Q(INST_ISSUED1_0, "inst_issued1_0" ),
56 _Q(INST_ISSUED1_1, "inst_issued1_1" ),
57 _Q(INST_ISSUED2_0, "inst_issued2_0" ),
58 _Q(INST_ISSUED2_1, "inst_issued2_1" ),
59 _Q(L1_GLD_HIT, "l1_global_load_hit" ),
60 _Q(L1_GLD_MISS, "l1_global_load_miss" ),
61 _Q(L1_GLD_TRANSACTIONS, "__l1_global_load_transactions" ),
62 _Q(L1_GST_TRANSACTIONS, "__l1_global_store_transactions" ),
63 _Q(L1_LOCAL_LD_HIT, "l1_local_load_hit" ),
64 _Q(L1_LOCAL_LD_MISS, "l1_local_load_miss" ),
65 _Q(L1_LOCAL_ST_HIT, "l1_local_store_hit" ),
66 _Q(L1_LOCAL_ST_MISS, "l1_local_store_miss" ),
67 _Q(L1_SHARED_LD_TRANSACTIONS, "l1_shared_load_transactions" ),
68 _Q(L1_SHARED_ST_TRANSACTIONS, "l1_shared_store_transactions" ),
69 _Q(LOCAL_LD, "local_load" ),
70 _Q(LOCAL_LD_TRANSACTIONS, "local_load_transactions" ),
71 _Q(LOCAL_ST, "local_store" ),
72 _Q(LOCAL_ST_TRANSACTIONS, "local_store_transactions" ),
73 _Q(NOT_PRED_OFF_INST_EXECUTED, "not_predicated_off_thread_inst_executed" ),
74 _Q(PROF_TRIGGER_0, "prof_trigger_00" ),
75 _Q(PROF_TRIGGER_1, "prof_trigger_01" ),
76 _Q(PROF_TRIGGER_2, "prof_trigger_02" ),
77 _Q(PROF_TRIGGER_3, "prof_trigger_03" ),
78 _Q(PROF_TRIGGER_4, "prof_trigger_04" ),
79 _Q(PROF_TRIGGER_5, "prof_trigger_05" ),
80 _Q(PROF_TRIGGER_6, "prof_trigger_06" ),
81 _Q(PROF_TRIGGER_7, "prof_trigger_07" ),
82 _Q(SHARED_LD, "shared_load" ),
83 _Q(SHARED_LD_REPLAY, "shared_load_replay" ),
84 _Q(SHARED_ST, "shared_store" ),
85 _Q(SHARED_ST_REPLAY, "shared_store_replay" ),
86 _Q(SM_CTA_LAUNCHED, "sm_cta_launched" ),
87 _Q(THREADS_LAUNCHED, "threads_launched" ),
88 _Q(TH_INST_EXECUTED, "thread_inst_executed" ),
89 _Q(TH_INST_EXECUTED_0, "thread_inst_executed_0" ),
90 _Q(TH_INST_EXECUTED_1, "thread_inst_executed_1" ),
91 _Q(TH_INST_EXECUTED_2, "thread_inst_executed_2" ),
92 _Q(TH_INST_EXECUTED_3, "thread_inst_executed_3" ),
93 _Q(UNCACHED_GLD_TRANSACTIONS, "uncached_global_load_transaction" ),
94 _Q(WARPS_LAUNCHED, "warps_launched" ),
95 };
96
97 #undef _Q
98
99 static inline const char *
100 nvc0_hw_sm_query_get_name(unsigned query_type)
101 {
102 unsigned i;
103
104 for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) {
105 if (nvc0_hw_sm_queries[i].type == query_type)
106 return nvc0_hw_sm_queries[i].name;
107 }
108 assert(0);
109 return NULL;
110 }
111
112 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
113
114 /* Code to read out MP counters: They are accessible via mmio, too, but let's
115 * just avoid mapping registers in userspace. We'd have to know which MPs are
116 * enabled/present, too, and that information is not presently exposed.
117 * We could add a kernel interface for it, but reading the counters like this
118 * has the advantage of being async (if get_result isn't called immediately).
119 */
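/* The precompiled kernels below store the $pm0..$pm7 counters plus a sequence
 * word for each MP into the query buffer; nvc0_hw_sm_end_query() launches
 * them and the *_read_data() helpers parse the buffer layout documented in
 * nvc0_hw_sm_create_query(). */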
120 static const uint64_t nve4_read_hw_sm_counters_code[] =
121 {
122 /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
123 * mov b32 $r8 $tidx
124 * mov b32 $r12 $physid
125 * mov b32 $r0 $pm0
126 * mov b32 $r1 $pm1
127 * mov b32 $r2 $pm2
128 * mov b32 $r3 $pm3
129 * mov b32 $r4 $pm4
130 * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
131 * mov b32 $r5 $pm5
132 * mov b32 $r6 $pm6
133 * mov b32 $r7 $pm7
134 * set $p0 0x1 eq u32 $r8 0x0
135 * mov b32 $r10 c0[0x0]
136 * ext u32 $r8 $r12 0x414
137 * mov b32 $r11 c0[0x4]
138 * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
139 * ext u32 $r9 $r12 0x208
140 * (not $p0) exit
141 * set $p1 0x1 eq u32 $r9 0x0
142 * mul $r8 u32 $r8 u32 96
143 * mul $r12 u32 $r9 u32 16
144 * mul $r13 u32 $r9 u32 4
145 * add b32 $r9 $r8 $r13
146 * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
147 * add b32 $r8 $r8 $r12
148 * mov b32 $r12 $r10
149 * add b32 $r10 $c $r10 $r8
150 * mov b32 $r13 $r11
151 * add b32 $r11 $r11 0x0 $c
152 * add b32 $r12 $c $r12 $r9
153 * st b128 wt g[$r10d] $r0q
154 * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
155 * mov b32 $r0 c0[0x8]
156 * add b32 $r13 $r13 0x0 $c
157 * $p1 st b128 wt g[$r12d+0x40] $r4q
158 * st b32 wt g[$r12d+0x50] $r0
159 * exit */
160 0x2202020202020207ULL,
161 0x2c00000084021c04ULL,
162 0x2c0000000c031c04ULL,
163 0x2c00000010001c04ULL,
164 0x2c00000014005c04ULL,
165 0x2c00000018009c04ULL,
166 0x2c0000001c00dc04ULL,
167 0x2c00000020011c04ULL,
168 0x22b0420042320207ULL,
169 0x2c00000024015c04ULL,
170 0x2c00000028019c04ULL,
171 0x2c0000002c01dc04ULL,
172 0x190e0000fc81dc03ULL,
173 0x2800400000029de4ULL,
174 0x7000c01050c21c03ULL,
175 0x280040001002dde4ULL,
176 0x204282020042e047ULL,
177 0x7000c00820c25c03ULL,
178 0x80000000000021e7ULL,
179 0x190e0000fc93dc03ULL,
180 0x1000000180821c02ULL,
181 0x1000000040931c02ULL,
182 0x1000000010935c02ULL,
183 0x4800000034825c03ULL,
184 0x22c042c042c04287ULL,
185 0x4800000030821c03ULL,
186 0x2800000028031de4ULL,
187 0x4801000020a29c03ULL,
188 0x280000002c035de4ULL,
189 0x0800000000b2dc42ULL,
190 0x4801000024c31c03ULL,
191 0x9400000000a01fc5ULL,
192 0x200002e04202c047ULL,
193 0x2800400020001de4ULL,
194 0x0800000000d35c42ULL,
195 0x9400000100c107c5ULL,
196 0x9400000140c01f85ULL,
197 0x8000000000001de7ULL
198 };
199
200 static const uint64_t nvf0_read_hw_sm_counters_code[] =
201 {
202 /* Same kernel as GK104 */
203 0x0880808080808080ULL,
204 0x86400000109c0022ULL,
205 0x86400000019c0032ULL,
206 0x86400000021c0002ULL,
207 0x86400000029c0006ULL,
208 0x86400000031c000aULL,
209 0x86400000039c000eULL,
210 0x86400000041c0012ULL,
211 0x08ac1080108c8080ULL,
212 0x86400000049c0016ULL,
213 0x86400000051c001aULL,
214 0x86400000059c001eULL,
215 0xdb201c007f9c201eULL,
216 0x64c03c00001c002aULL,
217 0xc00000020a1c3021ULL,
218 0x64c03c00009c002eULL,
219 0x0810a0808010b810ULL,
220 0xc0000001041c3025ULL,
221 0x180000000020003cULL,
222 0xdb201c007f9c243eULL,
223 0xc1c00000301c2021ULL,
224 0xc1c00000081c2431ULL,
225 0xc1c00000021c2435ULL,
226 0xe0800000069c2026ULL,
227 0x08b010b010b010a0ULL,
228 0xe0800000061c2022ULL,
229 0xe4c03c00051c0032ULL,
230 0xe0840000041c282aULL,
231 0xe4c03c00059c0036ULL,
232 0xe08040007f9c2c2eULL,
233 0xe0840000049c3032ULL,
234 0xfe800000001c2800ULL,
235 0x080000b81080b010ULL,
236 0x64c03c00011c0002ULL,
237 0xe08040007f9c3436ULL,
238 0xfe80000020043010ULL,
239 0xfc800000281c3000ULL,
240 0x18000000001c003cULL,
241 };
242
243 /* For simplicity, we will allocate as many group slots as we allocate counter
244 * slots. This means that a single counter which wants to source from 2 groups
245 * will have to be declared as using 2 counter slots. This shouldn't really be
246 * a problem because such queries don't make much sense ... (unless someone is
247 * really creative).
248 */
249 struct nvc0_hw_sm_counter_cfg
250 {
251 uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */
252 uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */
253 uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
254 uint32_t sig_sel : 8; /* signal group */
255 uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */
256 uint32_t src_sel; /* signal selection for up to 4 sources */
257 };
258
259 struct nvc0_hw_sm_query_cfg
260 {
261 unsigned type;
262 struct nvc0_hw_sm_counter_cfg ctr[8];
263 uint8_t num_counters;
264 uint8_t norm[2]; /* normalization num,denom */
265 };
266
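/* _CA/_CB expand to counter configs sourcing from signal domain A or B
 * respectively: f = func, m = mode, g = signal group, s = source selection. */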
267 #define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }
268 #define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }
269 #define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
270
271 /* ==== Compute capability 3.0 (GK104:GK110) ==== */
272 static const struct nvc0_hw_sm_query_cfg
273 sm30_active_cycles =
274 {
275 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
276 .ctr[0] = _CB(0x0001, B6, WARP, 0x00000000),
277 .num_counters = 1,
278 .norm = { 1, 1 },
279 };
280
281 static const struct nvc0_hw_sm_query_cfg
282 sm30_active_warps =
283 {
284 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
285 .ctr[0] = _CB(0x003f, B6, WARP, 0x31483104),
286 .num_counters = 1,
287 .norm = { 2, 1 },
288 };
289
290 static const struct nvc0_hw_sm_query_cfg
291 sm30_atom_cas_count =
292 {
293 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
 294    .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000004),
295 .num_counters = 1,
296 .norm = { 1, 1 },
297 };
298
299 static const struct nvc0_hw_sm_query_cfg
300 sm30_atom_count =
301 {
302 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
303 .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000000),
304 .num_counters = 1,
305 .norm = { 1, 1 },
306 };
307
308 static const struct nvc0_hw_sm_query_cfg
309 sm30_branch =
310 {
311 .type = NVC0_HW_SM_QUERY_BRANCH,
312 .ctr[0] = _CA(0x0001, B6, BRANCH, 0x0000000c),
313 .num_counters = 1,
314 .norm = { 1, 1 },
315 };
316
317 static const struct nvc0_hw_sm_query_cfg
318 sm30_divergent_branch =
319 {
320 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
321 .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000010),
322 .num_counters = 1,
323 .norm = { 1, 1 },
324 };
325
326 static const struct nvc0_hw_sm_query_cfg
327 sm30_gld_request =
328 {
329 .type = NVC0_HW_SM_QUERY_GLD_REQUEST,
330 .ctr[0] = _CA(0x0001, B6, LDST, 0x00000010),
331 .num_counters = 1,
332 .norm = { 1, 1 },
333 };
334
335 static const struct nvc0_hw_sm_query_cfg
336 sm30_gld_mem_div_replay =
337 {
338 .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
339 .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000010),
340 .num_counters = 1,
341 .norm = { 1, 1 },
342 };
343
344 static const struct nvc0_hw_sm_query_cfg
345 sm30_gst_transactions =
346 {
347 .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
348 .ctr[0] = _CB(0x0001, B6, MEM, 0x00000004),
349 .num_counters = 1,
350 .norm = { 1, 1 },
351 };
352
353 static const struct nvc0_hw_sm_query_cfg
354 sm30_gst_mem_div_replay =
355 {
356 .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
357 .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000014),
358 .num_counters = 1,
359 .norm = { 1, 1 },
360 };
361
362 static const struct nvc0_hw_sm_query_cfg
363 sm30_gred_count =
364 {
365 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
366 .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000008),
367 .num_counters = 1,
368 .norm = { 1, 1 },
369 };
370
371 static const struct nvc0_hw_sm_query_cfg
372 sm30_gst_request =
373 {
374 .type = NVC0_HW_SM_QUERY_GST_REQUEST,
375 .ctr[0] = _CA(0x0001, B6, LDST, 0x00000014),
376 .num_counters = 1,
377 .norm = { 1, 1 },
378 };
379
380 static const struct nvc0_hw_sm_query_cfg
381 sm30_inst_executed =
382 {
383 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
384 .ctr[0] = _CA(0x0003, B6, EXEC, 0x00000398),
385 .num_counters = 1,
386 .norm = { 1, 1 },
387 };
388
389 static const struct nvc0_hw_sm_query_cfg
390 sm30_inst_issued1 =
391 {
392 .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
393 .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000004),
394 .num_counters = 1,
395 .norm = { 1, 1 },
396 };
397
398 static const struct nvc0_hw_sm_query_cfg
399 sm30_inst_issued2 =
400 {
401 .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
402 .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000008),
403 .num_counters = 1,
404 .norm = { 1, 1 },
405 };
406
407 static const struct nvc0_hw_sm_query_cfg
408 sm30_l1_gld_hit =
409 {
410 .type = NVC0_HW_SM_QUERY_L1_GLD_HIT,
411 .ctr[0] = _CB(0x0001, B6, L1, 0x00000010),
412 .num_counters = 1,
413 .norm = { 1, 1 },
414 };
415
416 static const struct nvc0_hw_sm_query_cfg
417 sm30_l1_gld_miss =
418 {
419 .type = NVC0_HW_SM_QUERY_L1_GLD_MISS,
420 .ctr[0] = _CB(0x0001, B6, L1, 0x00000014),
421 .num_counters = 1,
422 .norm = { 1, 1 },
423 };
424
425 static const struct nvc0_hw_sm_query_cfg
426 sm30_l1_gld_transactions =
427 {
428 .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS,
429 .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000000),
430 .num_counters = 1,
431 .norm = { 1, 1 },
432 };
433
434 static const struct nvc0_hw_sm_query_cfg
435 sm30_l1_gst_transactions =
436 {
437 .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS,
438 .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000004),
439 .num_counters = 1,
440 .norm = { 1, 1 },
441 };
442
443 static const struct nvc0_hw_sm_query_cfg
444 sm30_l1_local_ld_hit =
445 {
446 .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT,
447 .ctr[0] = _CB(0x0001, B6, L1, 0x00000000),
448 .num_counters = 1,
449 .norm = { 1, 1 },
450 };
451
452 static const struct nvc0_hw_sm_query_cfg
453 sm30_l1_local_ld_miss =
454 {
455 .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS,
456 .ctr[0] = _CB(0x0001, B6, L1, 0x00000004),
457 .num_counters = 1,
458 .norm = { 1, 1 },
459 };
460
461 static const struct nvc0_hw_sm_query_cfg
462 sm30_l1_local_st_hit =
463 {
464 .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT,
465 .ctr[0] = _CB(0x0001, B6, L1, 0x00000008),
466 .num_counters = 1,
467 .norm = { 1, 1 },
468 };
469
470 static const struct nvc0_hw_sm_query_cfg
471 sm30_l1_local_st_miss =
472 {
473 .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS,
474 .ctr[0] = _CB(0x0001, B6, L1, 0x0000000c),
475 .num_counters = 1,
476 .norm = { 1, 1 },
477 };
478
479 static const struct nvc0_hw_sm_query_cfg
480 sm30_l1_shared_ld_transactions =
481 {
482 .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
483 .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000008),
484 .num_counters = 1,
485 .norm = { 1, 1 },
486 };
487
488 static const struct nvc0_hw_sm_query_cfg
489 sm30_l1_shared_st_transactions =
490 {
491 .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
492 .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x0000000c),
493 .num_counters = 1,
494 .norm = { 1, 1 },
495 };
496
497 static const struct nvc0_hw_sm_query_cfg
498 sm30_local_ld =
499 {
500 .type = NVC0_HW_SM_QUERY_LOCAL_LD,
501 .ctr[0] = _CA(0x0001, B6, LDST, 0x00000008),
502 .num_counters = 1,
503 .norm = { 1, 1 },
504 };
505
506 static const struct nvc0_hw_sm_query_cfg
507 sm30_local_ld_transactions =
508 {
509 .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
510 .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000000),
511 .num_counters = 1,
512 .norm = { 1, 1 },
513 };
514
515 static const struct nvc0_hw_sm_query_cfg
516 sm30_local_st =
517 {
518 .type = NVC0_HW_SM_QUERY_LOCAL_ST,
519 .ctr[0] = _CA(0x0001, B6, LDST, 0x0000000c),
520 .num_counters = 1,
521 .norm = { 1, 1 },
522 };
523
524 static const struct nvc0_hw_sm_query_cfg
525 sm30_local_st_transactions =
526 {
527 .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
528 .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000004),
529 .num_counters = 1,
530 .norm = { 1, 1 },
531 };
532
533 static const struct nvc0_hw_sm_query_cfg
534 sm30_prof_trigger_0 =
535 {
536 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
537 .ctr[0] = _CA(0x0001, B6, USER, 0x00000000),
538 .num_counters = 1,
539 .norm = { 1, 1 },
540 };
541
542 static const struct nvc0_hw_sm_query_cfg
543 sm30_prof_trigger_1 =
544 {
545 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
546 .ctr[0] = _CA(0x0001, B6, USER, 0x00000004),
547 .num_counters = 1,
548 .norm = { 1, 1 },
549 };
550
551 static const struct nvc0_hw_sm_query_cfg
552 sm30_prof_trigger_2 =
553 {
554 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
555 .ctr[0] = _CA(0x0001, B6, USER, 0x00000008),
556 .num_counters = 1,
557 .norm = { 1, 1 },
558 };
559
560 static const struct nvc0_hw_sm_query_cfg
561 sm30_prof_trigger_3 =
562 {
563 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
564 .ctr[0] = _CA(0x0001, B6, USER, 0x0000000c),
565 .num_counters = 1,
566 .norm = { 1, 1 },
567 };
568
569 static const struct nvc0_hw_sm_query_cfg
570 sm30_prof_trigger_4 =
571 {
572 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
573 .ctr[0] = _CA(0x0001, B6, USER, 0x00000010),
574 .num_counters = 1,
575 .norm = { 1, 1 },
576 };
577
578 static const struct nvc0_hw_sm_query_cfg
579 sm30_prof_trigger_5 =
580 {
581 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
582 .ctr[0] = _CA(0x0001, B6, USER, 0x00000014),
583 .num_counters = 1,
584 .norm = { 1, 1 },
585 };
586
587 static const struct nvc0_hw_sm_query_cfg
588 sm30_prof_trigger_6 =
589 {
590 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
591 .ctr[0] = _CA(0x0001, B6, USER, 0x00000018),
592 .num_counters = 1,
593 .norm = { 1, 1 },
594 };
595
596 static const struct nvc0_hw_sm_query_cfg
597 sm30_prof_trigger_7 =
598 {
599 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
600 .ctr[0] = _CA(0x0001, B6, USER, 0x0000001c),
601 .num_counters = 1,
602 .norm = { 1, 1 },
603 };
604
605 static const struct nvc0_hw_sm_query_cfg
606 sm30_shared_ld =
607 {
608 .type = NVC0_HW_SM_QUERY_SHARED_LD,
609 .ctr[0] = _CA(0x0001, B6, LDST, 0x00000000),
610 .num_counters = 1,
611 .norm = { 1, 1 },
612 };
613
614 static const struct nvc0_hw_sm_query_cfg
615 sm30_shared_ld_replay =
616 {
617 .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
618 .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000008),
619 .num_counters = 1,
620 .norm = { 1, 1 },
621 };
622
623 static const struct nvc0_hw_sm_query_cfg
624 sm30_shared_st =
625 {
626 .type = NVC0_HW_SM_QUERY_SHARED_ST,
627 .ctr[0] = _CA(0x0001, B6, LDST, 0x00000004),
628 .num_counters = 1,
629 .norm = { 1, 1 },
630 };
631
632 static const struct nvc0_hw_sm_query_cfg
633 sm30_shared_st_replay =
634 {
635 .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
636 .ctr[0] = _CB(0x0001, B6, REPLAY, 0x0000000c),
637 .num_counters = 1,
638 .norm = { 1, 1 },
639 };
640
641 static const struct nvc0_hw_sm_query_cfg
642 sm30_sm_cta_launched =
643 {
644 .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
645 .ctr[0] = _CB(0x0001, B6, WARP, 0x0000001c),
646 .num_counters = 1,
647 .norm = { 1, 1 },
648 };
649
650 static const struct nvc0_hw_sm_query_cfg
651 sm30_threads_launched =
652 {
653 .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
654 .ctr[0] = _CA(0x003f, B6, LAUNCH, 0x398a4188),
655 .num_counters = 1,
656 .norm = { 1, 1 },
657 };
658
659 static const struct nvc0_hw_sm_query_cfg
660 sm30_uncached_gld_transactions =
661 {
662 .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
663 .ctr[0] = _CB(0x0001, B6, MEM, 0x00000000),
664 .num_counters = 1,
665 .norm = { 1, 1 },
666 };
667
668 static const struct nvc0_hw_sm_query_cfg
669 sm30_warps_launched =
670 {
671 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
672 .ctr[0] = _CA(0x0001, B6, LAUNCH, 0x00000004),
673 .num_counters = 1,
674 .norm = { 1, 1 },
675 };
676
677 /* NOTES:
 678  * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
679 * inst_executed etc.: we only count a single warp scheduler
680 */
681 static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] =
682 {
683 &sm30_active_cycles,
684 &sm30_active_warps,
685 &sm30_atom_cas_count,
686 &sm30_atom_count,
687 &sm30_branch,
688 &sm30_divergent_branch,
689 &sm30_gld_request,
690 &sm30_gld_mem_div_replay,
691 &sm30_gst_transactions,
692 &sm30_gst_mem_div_replay,
693 &sm30_gred_count,
694 &sm30_gst_request,
695 &sm30_inst_executed,
696 &sm30_inst_issued1,
697 &sm30_inst_issued2,
698 &sm30_l1_gld_hit,
699 &sm30_l1_gld_miss,
700 &sm30_l1_gld_transactions,
701 &sm30_l1_gst_transactions,
702 &sm30_l1_local_ld_hit,
703 &sm30_l1_local_ld_miss,
704 &sm30_l1_local_st_hit,
705 &sm30_l1_local_st_miss,
706 &sm30_l1_shared_ld_transactions,
707 &sm30_l1_shared_st_transactions,
708 &sm30_local_ld,
709 &sm30_local_ld_transactions,
710 &sm30_local_st,
711 &sm30_local_st_transactions,
712 &sm30_prof_trigger_0,
713 &sm30_prof_trigger_1,
714 &sm30_prof_trigger_2,
715 &sm30_prof_trigger_3,
716 &sm30_prof_trigger_4,
717 &sm30_prof_trigger_5,
718 &sm30_prof_trigger_6,
719 &sm30_prof_trigger_7,
720 &sm30_shared_ld,
721 &sm30_shared_ld_replay,
722 &sm30_shared_st,
723 &sm30_shared_st_replay,
724 &sm30_sm_cta_launched,
725 &sm30_threads_launched,
726 &sm30_uncached_gld_transactions,
727 &sm30_warps_launched,
728 };
729
730 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
731 static const struct nvc0_hw_sm_query_cfg
732 sm35_atom_cas_count =
733 {
734 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
735 .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000014),
736 .num_counters = 1,
737 .norm = { 1, 1 },
738 };
739
740 static const struct nvc0_hw_sm_query_cfg
741 sm35_atom_count =
742 {
743 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
744 .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000010),
745 .num_counters = 1,
746 .norm = { 1, 1 },
747 };
748
749 static const struct nvc0_hw_sm_query_cfg
750 sm35_gred_count =
751 {
752 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
753 .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000018),
754 .num_counters = 1,
755 .norm = { 1, 1 },
756 };
757
758 static const struct nvc0_hw_sm_query_cfg
759 sm35_not_pred_off_inst_executed =
760 {
761 .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
762 .ctr[0] = _CA(0x003f, B6, UNK14, 0x29062080),
763 .num_counters = 1,
764 .norm = { 1, 1 },
765 };
766
767 static const struct nvc0_hw_sm_query_cfg
768 sm35_shared_ld_replay =
769 {
770 .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
771 .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018),
772 .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x00000151),
773 .num_counters = 2,
774 .norm = { 1, 1 },
775 };
776
777 static const struct nvc0_hw_sm_query_cfg
778 sm35_shared_st_replay =
779 {
780 .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
781 .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018),
782 .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x000001d1),
783 .num_counters = 2,
784 .norm = { 1, 1 },
785 };
786
787 static const struct nvc0_hw_sm_query_cfg
788 sm35_th_inst_executed =
789 {
790 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
791 .ctr[0] = _CA(0x003f, B6, UNK11, 0x29062080),
792 .num_counters = 1,
793 .norm = { 1, 1 },
794 };
795
796 static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
797 {
798 &sm30_active_cycles,
799 &sm30_active_warps,
800 &sm35_atom_cas_count,
801 &sm35_atom_count,
802 &sm30_gld_request,
803 &sm30_gld_mem_div_replay,
804 &sm30_gst_transactions,
805 &sm30_gst_mem_div_replay,
806 &sm35_gred_count,
807 &sm30_gst_request,
808 &sm30_inst_executed,
809 &sm30_inst_issued1,
810 &sm30_inst_issued2,
811 &sm30_l1_gld_hit,
812 &sm30_l1_gld_miss,
813 &sm30_l1_gld_transactions,
814 &sm30_l1_gst_transactions,
815 &sm30_l1_local_ld_hit,
816 &sm30_l1_local_ld_miss,
817 &sm30_l1_local_st_hit,
818 &sm30_l1_local_st_miss,
819 &sm30_l1_shared_ld_transactions,
820 &sm30_l1_shared_st_transactions,
821 &sm30_local_ld,
822 &sm30_local_ld_transactions,
823 &sm30_local_st,
824 &sm30_local_st_transactions,
825 &sm35_not_pred_off_inst_executed,
826 &sm30_prof_trigger_0,
827 &sm30_prof_trigger_1,
828 &sm30_prof_trigger_2,
829 &sm30_prof_trigger_3,
830 &sm30_prof_trigger_4,
831 &sm30_prof_trigger_5,
832 &sm30_prof_trigger_6,
833 &sm30_prof_trigger_7,
834 &sm30_shared_ld,
835 &sm35_shared_ld_replay,
836 &sm30_shared_st,
837 &sm35_shared_st_replay,
838 &sm30_sm_cta_launched,
839 &sm35_th_inst_executed,
840 &sm30_threads_launched,
841 &sm30_uncached_gld_transactions,
842 &sm30_warps_launched,
843 };
844
845 #undef _Q
846 #undef _CA
847 #undef _CB
848
849 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
850 /* NOTES:
851 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
852 * because there is a context-switch problem that we need to fix.
853 * Results might be wrong sometimes, be careful!
854 */
855 static const uint64_t nvc0_read_hw_sm_counters_code[] =
856 {
857 /* mov b32 $r8 $tidx
858 * mov b32 $r9 $physid
859 * mov b32 $r0 $pm0
860 * mov b32 $r1 $pm1
861 * mov b32 $r2 $pm2
862 * mov b32 $r3 $pm3
863 * mov b32 $r4 $pm4
864 * mov b32 $r5 $pm5
865 * mov b32 $r6 $pm6
866 * mov b32 $r7 $pm7
867 * set $p0 0x1 eq u32 $r8 0x0
868 * mov b32 $r10 c0[0x0]
869 * mov b32 $r11 c0[0x4]
870 * ext u32 $r8 $r9 0x414
871 * (not $p0) exit
872 * mul $r8 u32 $r8 u32 48
873 * add b32 $r10 $c $r10 $r8
874 * add b32 $r11 $r11 0x0 $c
875 * mov b32 $r8 c0[0x8]
876 * st b128 wt g[$r10d+0x00] $r0q
877 * st b128 wt g[$r10d+0x10] $r4q
878 * st b32 wt g[$r10d+0x20] $r8
879 * exit */
880 0x2c00000084021c04ULL,
881 0x2c0000000c025c04ULL,
882 0x2c00000010001c04ULL,
883 0x2c00000014005c04ULL,
884 0x2c00000018009c04ULL,
885 0x2c0000001c00dc04ULL,
886 0x2c00000020011c04ULL,
887 0x2c00000024015c04ULL,
888 0x2c00000028019c04ULL,
889 0x2c0000002c01dc04ULL,
890 0x190e0000fc81dc03ULL,
891 0x2800400000029de4ULL,
892 0x280040001002dde4ULL,
893 0x7000c01050921c03ULL,
894 0x80000000000021e7ULL,
895 0x10000000c0821c02ULL,
896 0x4801000020a29c03ULL,
897 0x0800000000b2dc42ULL,
898 0x2800400020021de4ULL,
899 0x9400000000a01fc5ULL,
900 0x9400000040a11fc5ULL,
901 0x9400000080a21f85ULL,
902 0x8000000000001de7ULL
903 };
904
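/* _C: Fermi counter config; f = func, o = op mode, g = signal group,
 * m = src_mask, s = src_sel. */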
905 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
906
907 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
908 static const struct nvc0_hw_sm_query_cfg
909 sm20_active_cycles =
910 {
911 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
912 .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
913 .num_counters = 1,
914 .norm = { 1, 1 },
915 };
916
917 static const struct nvc0_hw_sm_query_cfg
918 sm20_active_warps =
919 {
920 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
921 .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
922 .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
923 .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
924 .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
925 .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
926 .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
927 .num_counters = 6,
928 .norm = { 1, 1 },
929 };
930
931 static const struct nvc0_hw_sm_query_cfg
932 sm20_atom_count =
933 {
934 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
935 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
936 .num_counters = 1,
937 .norm = { 1, 1 },
938 };
939
940 static const struct nvc0_hw_sm_query_cfg
941 sm20_branch =
942 {
943 .type = NVC0_HW_SM_QUERY_BRANCH,
944 .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
945 .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
946 .num_counters = 2,
947 .norm = { 1, 1 },
948 };
949
950 static const struct nvc0_hw_sm_query_cfg
951 sm20_divergent_branch =
952 {
953 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
954 .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
955 .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
956 .num_counters = 2,
957 .norm = { 1, 1 },
958 };
959
960 static const struct nvc0_hw_sm_query_cfg
961 sm20_gld_request =
962 {
963 .type = NVC0_HW_SM_QUERY_GLD_REQUEST,
964 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
965 .num_counters = 1,
966 .norm = { 1, 1 },
967 };
968
969 static const struct nvc0_hw_sm_query_cfg
970 sm20_gred_count =
971 {
972 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
973 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
974 .num_counters = 1,
975 .norm = { 1, 1 },
976 };
977
978 static const struct nvc0_hw_sm_query_cfg
979 sm20_gst_request =
980 {
981 .type = NVC0_HW_SM_QUERY_GST_REQUEST,
982 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
983 .num_counters = 1,
984 .norm = { 1, 1 },
985 };
986
987 static const struct nvc0_hw_sm_query_cfg
988 sm20_inst_executed =
989 {
990 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
991 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
992 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
993 .num_counters = 2,
994 .norm = { 1, 1 },
995 };
996
997 static const struct nvc0_hw_sm_query_cfg
998 sm20_inst_issued =
999 {
1000 .type = NVC0_HW_SM_QUERY_INST_ISSUED,
1001 .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
1002 .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
1003 .num_counters = 2,
1004 .norm = { 1, 1 },
1005 };
1006
1007 static const struct nvc0_hw_sm_query_cfg
1008 sm20_local_ld =
1009 {
1010 .type = NVC0_HW_SM_QUERY_LOCAL_LD,
1011 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
1012 .num_counters = 1,
1013 .norm = { 1, 1 },
1014 };
1015
1016 static const struct nvc0_hw_sm_query_cfg
1017 sm20_local_st =
1018 {
1019 .type = NVC0_HW_SM_QUERY_LOCAL_ST,
1020 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
1021 .num_counters = 1,
1022 .norm = { 1, 1 },
1023 };
1024
1025 static const struct nvc0_hw_sm_query_cfg
1026 sm20_prof_trigger_0 =
1027 {
1028 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
1029 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
1030 .num_counters = 1,
1031 .norm = { 1, 1 },
1032 };
1033
1034 static const struct nvc0_hw_sm_query_cfg
1035 sm20_prof_trigger_1 =
1036 {
1037 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
1038 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
1039 .num_counters = 1,
1040 .norm = { 1, 1 },
1041 };
1042
1043 static const struct nvc0_hw_sm_query_cfg
1044 sm20_prof_trigger_2 =
1045 {
1046 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
1047 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
1048 .num_counters = 1,
1049 .norm = { 1, 1 },
1050 };
1051
1052 static const struct nvc0_hw_sm_query_cfg
1053 sm20_prof_trigger_3 =
1054 {
1055 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
1056 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
1057 .num_counters = 1,
1058 .norm = { 1, 1 },
1059 };
1060
1061 static const struct nvc0_hw_sm_query_cfg
1062 sm20_prof_trigger_4 =
1063 {
1064 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
1065 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
1066 .num_counters = 1,
1067 .norm = { 1, 1 },
1068 };
1069
1070 static const struct nvc0_hw_sm_query_cfg
1071 sm20_prof_trigger_5 =
1072 {
1073 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
1074 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
1075 .num_counters = 1,
1076 .norm = { 1, 1 },
1077 };
1078
1079 static const struct nvc0_hw_sm_query_cfg
1080 sm20_prof_trigger_6 =
1081 {
1082 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
1083 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
1084 .num_counters = 1,
1085 .norm = { 1, 1 },
1086 };
1087
1088 static const struct nvc0_hw_sm_query_cfg
1089 sm20_prof_trigger_7 =
1090 {
1091 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
1092 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
1093 .num_counters = 1,
1094 .norm = { 1, 1 },
1095 };
1096
1097 static const struct nvc0_hw_sm_query_cfg
1098 sm20_shared_ld =
1099 {
1100 .type = NVC0_HW_SM_QUERY_SHARED_LD,
1101 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
1102 .num_counters = 1,
1103 .norm = { 1, 1 },
1104 };
1105
1106 static const struct nvc0_hw_sm_query_cfg
1107 sm20_shared_st =
1108 {
1109 .type = NVC0_HW_SM_QUERY_SHARED_ST,
1110 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
1111 .num_counters = 1,
1112 .norm = { 1, 1 },
1113 };
1114
1115 static const struct nvc0_hw_sm_query_cfg
1116 sm20_threads_launched =
1117 {
1118 .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
1119 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
1120 .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
1121 .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
1122 .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
1123 .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
1124 .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
1125 .num_counters = 6,
1126 .norm = { 1, 1 },
1127 };
1128
1129 static const struct nvc0_hw_sm_query_cfg
1130 sm20_th_inst_executed_0 =
1131 {
1132 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
1133 .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
1134 .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
1135 .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
1136 .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
1137 .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
1138 .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
1139 .num_counters = 6,
1140 .norm = { 1, 1 },
1141 };
1142
1143 static const struct nvc0_hw_sm_query_cfg
1144 sm20_th_inst_executed_1 =
1145 {
1146 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
1147 .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
1148 .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
1149 .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
1150 .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
1151 .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
1152 .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
1153 .num_counters = 6,
1154 .norm = { 1, 1 },
1155 };
1156
1157 static const struct nvc0_hw_sm_query_cfg
1158 sm20_warps_launched =
1159 {
1160 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1161 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
1162 .num_counters = 1,
1163 .norm = { 1, 1 },
1164 };
1165
1166 static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
1167 {
1168 &sm20_active_cycles,
1169 &sm20_active_warps,
1170 &sm20_atom_count,
1171 &sm20_branch,
1172 &sm20_divergent_branch,
1173 &sm20_gld_request,
1174 &sm20_gred_count,
1175 &sm20_gst_request,
1176 &sm20_inst_executed,
1177 &sm20_inst_issued,
1178 &sm20_local_ld,
1179 &sm20_local_st,
1180 &sm20_prof_trigger_0,
1181 &sm20_prof_trigger_1,
1182 &sm20_prof_trigger_2,
1183 &sm20_prof_trigger_3,
1184 &sm20_prof_trigger_4,
1185 &sm20_prof_trigger_5,
1186 &sm20_prof_trigger_6,
1187 &sm20_prof_trigger_7,
1188 &sm20_shared_ld,
1189 &sm20_shared_st,
1190 &sm20_threads_launched,
1191 &sm20_th_inst_executed_0,
1192 &sm20_th_inst_executed_1,
1193 &sm20_warps_launched,
1194 };
1195
1196 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
1197 static const struct nvc0_hw_sm_query_cfg
1198 sm21_inst_executed =
1199 {
1200 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1201 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
1202 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
1203 .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
1204 .num_counters = 3,
1205 .norm = { 1, 1 },
1206 };
1207
1208 static const struct nvc0_hw_sm_query_cfg
1209 sm21_inst_issued1_0 =
1210 {
1211 .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0,
1212 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
1213 .num_counters = 1,
1214 .norm = { 1, 1 },
1215 };
1216
1217 static const struct nvc0_hw_sm_query_cfg
1218 sm21_inst_issued1_1 =
1219 {
1220 .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1,
1221 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
1222 .num_counters = 1,
1223 .norm = { 1, 1 },
1224 };
1225
1226 static const struct nvc0_hw_sm_query_cfg
1227 sm21_inst_issued2_0 =
1228 {
1229 .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0,
1230 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
1231 .num_counters = 1,
1232 .norm = { 1, 1 },
1233 };
1234
1235 static const struct nvc0_hw_sm_query_cfg
1236 sm21_inst_issued2_1 =
1237 {
1238 .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1,
1239 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
1240 .num_counters = 1,
1241 .norm = { 1, 1 },
1242 };
1243
1244 static const struct nvc0_hw_sm_query_cfg
1245 sm21_th_inst_executed_0 =
1246 {
1247 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
1248 .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
1249 .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
1250 .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
1251 .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
1252 .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
1253 .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
1254 .num_counters = 6,
1255 .norm = { 1, 1 },
1256 };
1257
1258 static const struct nvc0_hw_sm_query_cfg
1259 sm21_th_inst_executed_1 =
1260 {
1261 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
1262 .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
1263 .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
1264 .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
1265 .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
1266 .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
1267 .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
1268 .num_counters = 6,
1269 .norm = { 1, 1 },
1270 };
1271
1272 static const struct nvc0_hw_sm_query_cfg
1273 sm21_th_inst_executed_2 =
1274 {
1275 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
1276 .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
1277 .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
1278 .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
1279 .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
1280 .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
1281 .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
1282 .num_counters = 6,
1283 .norm = { 1, 1 },
1284 };
1285
1286 static const struct nvc0_hw_sm_query_cfg
1287 sm21_th_inst_executed_3 =
1288 {
1289 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
1290 .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
1291 .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
1292 .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
1293 .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
1294 .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
1295 .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
1296 .num_counters = 6,
1297 .norm = { 1, 1 },
1298 };
1299
1300 static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
1301 {
1302 &sm20_active_cycles,
1303 &sm20_active_warps,
1304 &sm20_atom_count,
1305 &sm20_branch,
1306 &sm20_divergent_branch,
1307 &sm20_gld_request,
1308 &sm20_gred_count,
1309 &sm20_gst_request,
1310 &sm21_inst_executed,
1311 &sm21_inst_issued1_0,
1312 &sm21_inst_issued1_1,
1313 &sm21_inst_issued2_0,
1314 &sm21_inst_issued2_1,
1315 &sm20_local_ld,
1316 &sm20_local_st,
1317 &sm20_prof_trigger_0,
1318 &sm20_prof_trigger_1,
1319 &sm20_prof_trigger_2,
1320 &sm20_prof_trigger_3,
1321 &sm20_prof_trigger_4,
1322 &sm20_prof_trigger_5,
1323 &sm20_prof_trigger_6,
1324 &sm20_prof_trigger_7,
1325 &sm20_shared_ld,
1326 &sm20_shared_st,
1327 &sm20_threads_launched,
1328 &sm21_th_inst_executed_0,
1329 &sm21_th_inst_executed_1,
1330 &sm21_th_inst_executed_2,
1331 &sm21_th_inst_executed_3,
1332 &sm20_warps_launched,
1333 };
1334
1335 #undef _C
1336
1337 static inline const struct nvc0_hw_sm_query_cfg **
1338 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
1339 {
1340 struct nouveau_device *dev = screen->base.device;
1341
1342 switch (screen->base.class_3d) {
1343 case NVF0_3D_CLASS:
1344 return sm35_hw_sm_queries;
1345 case NVE4_3D_CLASS:
1346 return sm30_hw_sm_queries;
1347 default:
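      /* GF100 (0xc0) and GF110 (0xc8) are compute capability 2.0, the
       * remaining Fermi chips use the 2.1 query set */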
1348 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
1349 return sm20_hw_sm_queries;
1350 return sm21_hw_sm_queries;
1351 }
1352 assert(0);
1353 return NULL;
1354 }
1355
1356 unsigned
1357 nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
1358 {
1359 struct nouveau_device *dev = screen->base.device;
1360
1361 switch (screen->base.class_3d) {
1362 case NVF0_3D_CLASS:
1363 return ARRAY_SIZE(sm35_hw_sm_queries);
1364 case NVE4_3D_CLASS:
1365 return ARRAY_SIZE(sm30_hw_sm_queries);
1366 default:
1367 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
1368 return ARRAY_SIZE(sm20_hw_sm_queries);
1369 return ARRAY_SIZE(sm21_hw_sm_queries);
1370 }
1371 return 0;
1372 }
1373
1374 static const struct nvc0_hw_sm_query_cfg *
1375 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1376 {
1377 const struct nvc0_hw_sm_query_cfg **queries;
1378 struct nvc0_screen *screen = nvc0->screen;
1379 struct nvc0_query *q = &hq->base;
1380 unsigned num_queries;
1381 unsigned i;
1382
1383 num_queries = nvc0_hw_sm_get_num_queries(screen);
1384 queries = nvc0_hw_sm_get_queries(screen);
1385
1386 for (i = 0; i < num_queries; i++) {
1387 if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type)
1388 return queries[i];
1389 }
1390 assert(0);
1391 return NULL;
1392 }
1393
1394 static void
1395 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1396 {
1397 struct nvc0_query *q = &hq->base;
1398 nvc0_hw_query_allocate(nvc0, q, 0);
1399 nouveau_fence_ref(NULL, &hq->fence);
1400 FREE(hq);
1401 }
1402
1403 static boolean
1404 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1405 {
1406 struct nvc0_screen *screen = nvc0->screen;
1407 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
1408 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1409 const struct nvc0_hw_sm_query_cfg *cfg;
1410 unsigned i, c;
1411 unsigned num_ab[2] = { 0, 0 };
1412
1413 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1414
1415 /* check if we have enough free counter slots */
1416 for (i = 0; i < cfg->num_counters; ++i)
1417 num_ab[cfg->ctr[i].sig_dom]++;
1418
1419 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
1420 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
1421       NOUVEAU_ERR("Not enough free MP counter slots!\n");
1422 return false;
1423 }
1424
1425 assert(cfg->num_counters <= 4);
1426    PUSH_SPACE(push, 4 * 8 + 6);
1427
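   /* one-time SW object method that appears to enable MP performance counters
    * in the kernel; the meaning of the magic value 0x1fcb is not documented
    * here */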
1428 if (!screen->pm.mp_counters_enabled) {
1429 screen->pm.mp_counters_enabled = true;
1430 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
1431 PUSH_DATA (push, 0x1fcb);
1432 }
1433
1434 /* set sequence field to 0 (used to check if result is available) */
1435    for (i = 0; i < screen->mp_count; ++i) {
1436       const unsigned b = (0x60 / 4) * i;
           hq->data[b + 20] = hq->data[b + 21] = 0;
           hq->data[b + 22] = hq->data[b + 23] = 0;
        }
1437 hq->sequence++;
1438
1439 for (i = 0; i < cfg->num_counters; ++i) {
1440 const unsigned d = cfg->ctr[i].sig_dom;
1441
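      /* first active counter in this signal domain: notify the SW object,
       * keeping the other domain's bit set if it is already in use */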
1442 if (!screen->pm.num_hw_sm_active[d]) {
1443 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
1444 if (screen->pm.num_hw_sm_active[!d])
1445 m |= 1 << (7 + (8 * d));
1446 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
1447 PUSH_DATA (push, m);
1448 }
1449 screen->pm.num_hw_sm_active[d]++;
1450
1451 for (c = d * 4; c < (d * 4 + 4); ++c) {
1452 if (!screen->pm.mp_counter[c]) {
1453 hsq->ctr[i] = c;
1454 screen->pm.mp_counter[c] = hsq;
1455 break;
1456 }
1457 }
1458 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
1459
1460 /* configure and reset the counter(s) */
1461 if (d == 0)
1462 BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1);
1463 else
1464 BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1);
1465 PUSH_DATA (push, cfg->ctr[i].sig_sel);
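      /* 0x2108421 has a 1 in every 5-bit field, so this presumably offsets
       * each packed source-select field by the slot index (c & 3), similar to
       * the Fermi mask_sel adjustment in nvc0_hw_sm_begin_query() */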
1466 BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1);
1467 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
1468 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1);
1469 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
1470 BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
1471 PUSH_DATA (push, 0);
1472 }
1473 return true;
1474 }
1475
1476 static boolean
1477 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1478 {
1479 struct nvc0_screen *screen = nvc0->screen;
1480 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
1481 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1482 const struct nvc0_hw_sm_query_cfg *cfg;
1483 unsigned i, c;
1484
1485 if (screen->base.class_3d >= NVE4_3D_CLASS)
1486 return nve4_hw_sm_begin_query(nvc0, hq);
1487
1488 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1489
1490 /* check if we have enough free counter slots */
1491 if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
1492       NOUVEAU_ERR("Not enough free MP counter slots!\n");
1493 return false;
1494 }
1495
1496 assert(cfg->num_counters <= 8);
1497 PUSH_SPACE(push, 8 * 8 + 2);
1498
1499 /* set sequence field to 0 (used to check if result is available) */
1500 for (i = 0; i < screen->mp_count; ++i) {
1501 const unsigned b = (0x30 / 4) * i;
1502 hq->data[b + 8] = 0;
1503 }
1504 hq->sequence++;
1505
1506 for (i = 0; i < cfg->num_counters; ++i) {
1507 uint32_t mask_sel = 0x00000000;
1508
1509 if (!screen->pm.num_hw_sm_active[0]) {
1510 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
1511 PUSH_DATA (push, 0x80000000);
1512 }
1513 screen->pm.num_hw_sm_active[0]++;
1514
1515 for (c = 0; c < 8; ++c) {
1516 if (!screen->pm.mp_counter[c]) {
1517 hsq->ctr[i] = c;
1518 screen->pm.mp_counter[c] = hsq;
1519 break;
1520 }
1521 }
1522
1523       /* Oddly enough, the signal id depends on the slot selected on Fermi but
1524        * not on Kepler. Fortunately, the signal ids are just offset by the
1525        * slot id! */
1526 mask_sel |= c;
1527 mask_sel |= (c << 8);
1528 mask_sel |= (c << 16);
1529 mask_sel |= (c << 24);
1530 mask_sel &= cfg->ctr[i].src_mask;
1531
1532 /* configure and reset the counter(s) */
1533 BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1);
1534 PUSH_DATA (push, cfg->ctr[i].sig_sel);
1535 BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1);
1536 PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
1537 BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1);
1538 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
1539 BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1);
1540 PUSH_DATA (push, 0);
1541 }
1542 return true;
1543 }
1544
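/* Wrap one of the precompiled counter-readout kernels above in a nvc0_program
 * so that it can be bound and launched like a regular compute shader. */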
1545 static inline struct nvc0_program *
1546 nvc0_hw_sm_get_program(struct nvc0_screen *screen)
1547 {
1548 struct nvc0_program *prog;
1549
1550 prog = CALLOC_STRUCT(nvc0_program);
1551 if (!prog)
1552 return NULL;
1553
1554 prog->type = PIPE_SHADER_COMPUTE;
1555 prog->translated = true;
1556 prog->parm_size = 12;
1557
1558 if (screen->base.class_3d == NVE4_3D_CLASS ||
1559 screen->base.class_3d == NVF0_3D_CLASS) {
1560 if (screen->base.class_3d == NVE4_3D_CLASS) {
1561 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
1562 prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
1563 } else {
1564 prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code;
1565 prog->code_size = sizeof(nvf0_read_hw_sm_counters_code);
1566 }
1567 prog->num_gprs = 14;
1568 } else {
1569 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
1570 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
1571 prog->num_gprs = 12;
1572 }
1573 return prog;
1574 }
1575
1576 static void
1577 nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1578 {
1579 struct nvc0_screen *screen = nvc0->screen;
1580 struct pipe_context *pipe = &nvc0->base.pipe;
1581 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
1582 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
1583 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1584 struct pipe_grid_info info = {};
1585 uint32_t mask;
1586 uint32_t input[3];
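   /* on Kepler, one warp per warp scheduler (4) reads its own counters;
    * on Fermi, a single warp per MP suffices */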
1587 const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
1588 const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
1589 unsigned c, i;
1590
1591 if (unlikely(!screen->pm.prog))
1592 screen->pm.prog = nvc0_hw_sm_get_program(screen);
1593
1594 /* disable all counting */
1595 PUSH_SPACE(push, 8);
1596 for (c = 0; c < 8; ++c)
1597 if (screen->pm.mp_counter[c]) {
1598 if (is_nve4) {
1599 IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0);
1600 } else {
1601 IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0);
1602 }
1603 }
1604 /* release counters for this query */
1605 for (c = 0; c < 8; ++c) {
1606 if (screen->pm.mp_counter[c] == hsq) {
1607 uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
1608 screen->pm.num_hw_sm_active[d]--;
1609 screen->pm.mp_counter[c] = NULL;
1610 }
1611 }
1612
1613 BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
1614 hq->bo);
1615
1616 PUSH_SPACE(push, 1);
1617 IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
1618
1619 pipe->bind_compute_state(pipe, screen->pm.prog);
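   /* kernel parameters: c0[0x0]/c0[0x4] = query buffer address (lo/hi),
    * c0[0x8] = sequence number used to detect result availability */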
1620 input[0] = (hq->bo->offset + hq->base_offset);
1621 input[1] = (hq->bo->offset + hq->base_offset) >> 32;
1622 input[2] = hq->sequence;
1623
1624 for (i = 0; i < 3; i++) {
1625 info.block[i] = block[i];
1626 info.grid[i] = grid[i];
1627 }
1628 info.pc = 0;
1629 info.input = input;
1630 pipe->launch_grid(pipe, &info);
1631
1632 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
1633
1634 /* re-activate other counters */
1635 PUSH_SPACE(push, 16);
1636 mask = 0;
1637 for (c = 0; c < 8; ++c) {
1638 const struct nvc0_hw_sm_query_cfg *cfg;
1639 unsigned i;
1640
1641 hsq = screen->pm.mp_counter[c];
1642 if (!hsq)
1643 continue;
1644
1645 cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
1646 for (i = 0; i < cfg->num_counters; ++i) {
1647 if (mask & (1 << hsq->ctr[i]))
1648 break;
1649 mask |= 1 << hsq->ctr[i];
1650 if (is_nve4) {
1651 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1);
1652 } else {
1653 BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1);
1654 }
1655 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
1656 }
1657 }
1658 }
1659
1660 static inline bool
1661 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
1662 struct nvc0_context *nvc0, bool wait,
1663 struct nvc0_hw_query *hq,
1664 const struct nvc0_hw_sm_query_cfg *cfg,
1665 unsigned mp_count)
1666 {
1667 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1668 unsigned p, c;
1669
1670 for (p = 0; p < mp_count; ++p) {
1671 const unsigned b = (0x30 / 4) * p;
1672
1673 for (c = 0; c < cfg->num_counters; ++c) {
1674 if (hq->data[b + 8] != hq->sequence) {
1675 if (!wait)
1676 return false;
1677 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
1678 return false;
1679 }
1680 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
1681 }
1682 }
1683 return true;
1684 }
1685
1686 static inline bool
1687 nve4_hw_sm_query_read_data(uint32_t count[32][8],
1688 struct nvc0_context *nvc0, bool wait,
1689 struct nvc0_hw_query *hq,
1690 const struct nvc0_hw_sm_query_cfg *cfg,
1691 unsigned mp_count)
1692 {
1693 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1694 unsigned p, c, d;
1695
1696 for (p = 0; p < mp_count; ++p) {
1697 const unsigned b = (0x60 / 4) * p;
1698
1699 for (c = 0; c < cfg->num_counters; ++c) {
1700 count[p][c] = 0;
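         /* counters 0-3 are per warp scheduler (sum the four copies), counters
          * 4-7 are per-MP (single value); see the layout comment in
          * nvc0_hw_sm_create_query() */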
1701 for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
1702 if (hq->data[b + 20 + d] != hq->sequence) {
1703 if (!wait)
1704 return false;
1705 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
1706 return false;
1707 }
1708 if (hsq->ctr[c] & ~0x3)
1709 count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
1710 else
1711 count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
1712 }
1713 }
1714 }
1715 return true;
1716 }
1717
1718 static boolean
1719 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
1720 boolean wait, union pipe_query_result *result)
1721 {
1722 uint32_t count[32][8];
1723 uint64_t value = 0;
1724 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
1725 unsigned p, c;
1726 const struct nvc0_hw_sm_query_cfg *cfg;
1727 bool ret;
1728
1729 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1730
1731 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
1732 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1733 else
1734 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1735 if (!ret)
1736 return false;
1737
1738 for (c = 0; c < cfg->num_counters; ++c)
1739 for (p = 0; p < mp_count; ++p)
1740 value += count[p][c];
1741 value = (value * cfg->norm[0]) / cfg->norm[1];
1742
1743 *(uint64_t *)result = value;
1744 return true;
1745 }
1746
1747 static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
1748 .destroy_query = nvc0_hw_sm_destroy_query,
1749 .begin_query = nvc0_hw_sm_begin_query,
1750 .end_query = nvc0_hw_sm_end_query,
1751 .get_query_result = nvc0_hw_sm_get_query_result,
1752 };
1753
1754 struct nvc0_hw_query *
1755 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
1756 {
1757 struct nvc0_screen *screen = nvc0->screen;
1758 struct nvc0_hw_sm_query *hsq;
1759 struct nvc0_hw_query *hq;
1760 unsigned space;
1761
1762 if (nvc0->screen->base.drm->version < 0x01000101)
1763 return NULL;
1764
1765 if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)
1766 return NULL;
1767
1768 hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
1769 if (!hsq)
1770 return NULL;
1771
1772 hq = &hsq->base;
1773 hq->funcs = &hw_sm_query_funcs;
1774 hq->base.type = type;
1775
1776 if (screen->base.class_3d >= NVE4_3D_CLASS) {
1777 /* for each MP:
1778 * [00] = WS0.C0
1779 * [04] = WS0.C1
1780 * [08] = WS0.C2
1781 * [0c] = WS0.C3
1782 * [10] = WS1.C0
1783 * [14] = WS1.C1
1784 * [18] = WS1.C2
1785 * [1c] = WS1.C3
1786 * [20] = WS2.C0
1787 * [24] = WS2.C1
1788 * [28] = WS2.C2
1789 * [2c] = WS2.C3
1790 * [30] = WS3.C0
1791 * [34] = WS3.C1
1792 * [38] = WS3.C2
1793 * [3c] = WS3.C3
1794 * [40] = MP.C4
1795 * [44] = MP.C5
1796 * [48] = MP.C6
1797 * [4c] = MP.C7
1798 * [50] = WS0.sequence
1799 * [54] = WS1.sequence
1800 * [58] = WS2.sequence
1801 * [5c] = WS3.sequence
1802 */
1803 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
1804 } else {
1805 /*
1806 * Note that padding is used to align memory access to 128 bits.
1807 *
1808 * for each MP:
1809 * [00] = MP.C0
1810 * [04] = MP.C1
1811 * [08] = MP.C2
1812 * [0c] = MP.C3
1813 * [10] = MP.C4
1814 * [14] = MP.C5
1815 * [18] = MP.C6
1816 * [1c] = MP.C7
1817 * [20] = MP.sequence
1818 * [24] = padding
1819 * [28] = padding
1820 * [2c] = padding
1821 */
1822 space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
1823 }
1824
1825 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
1826 FREE(hq);
1827 return NULL;
1828 }
1829
1830 return hq;
1831 }
1832
1833 int
1834 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
1835 struct pipe_driver_query_info *info)
1836 {
1837 int count = 0;
1838
1839 if (screen->base.drm->version >= 0x01000101) {
1840 if (screen->compute)
1841 count = nvc0_hw_sm_get_num_queries(screen);
1842 }
1843
1844 if (!info)
1845 return count;
1846
1847 if (id < count) {
1848 if (screen->compute) {
1849 if (screen->base.class_3d <= NVF0_3D_CLASS) {
1850 const struct nvc0_hw_sm_query_cfg **queries =
1851 nvc0_hw_sm_get_queries(screen);
1852
1853 info->name = nvc0_hw_sm_query_get_name(queries[id]->type);
1854 info->query_type = NVC0_HW_SM_QUERY(queries[id]->type);
1855 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1856 return 1;
1857 }
1858 }
1859 }
1860 return 0;
1861 }