/*
 * Copyright 2011 Christoph Bumiller
 * Copyright 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
33 /* NOTE: intentionally using the same names as NV */
34 #define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }
39 } nvc0_hw_sm_queries
[] = {
42 "Number of cycles a multiprocessor has at least one active warp"),
46 "Accumulated number of active warps per cycle. For every cycle it "
47 "increments by the number of active warps in the cycle which can be in "
52 "Number of warps executing atomic compare and swap operations. Increments "
53 "by one if at least one thread in a warp executes the instruction."),
57 "Number of warps executing atomic reduction operations. Increments by one "
58 "if at least one thread in a warp executes the instruction"),
62 "Number of branch instructions executed per warp on a multiprocessor"),
66 "Number of divergent branches within a warp. This counter will be "
67 "incremented by one if at least one thread in a warp diverges (that is, "
68 "follows a different execution path) via a conditional branch"),
72 "Number of executed load instructions where the state space is not "
73 "specified and hence generic addressing is used, increments per warp on a "
74 "multiprocessor. It can include the load operations from global,local and "
75 "shared state space"),
77 _Q(GLD_MEM_DIV_REPLAY
,
78 "global_ld_mem_divergence_replays",
79 "Number of instruction replays for global memory loads. Instruction is "
80 "replayed if the instruction is accessing more than one cache line of "
81 "128 bytes. For each extra cache line access the counter is incremented "
85 "global_store_transaction",
86 "Number of global store transactions. Increments by 1 per transaction. "
87 "Transaction can be 32/64/96/128B"),
89 _Q(GST_MEM_DIV_REPLAY
,
90 "global_st_mem_divergence_replays",
91 "Number of instruction replays for global memory stores. Instruction is "
92 "replayed if the instruction is accessing more than one cache line of "
93 "128 bytes. For each extra cache line access the counter is incremented "
98 "Number of warps executing reduction operations on global memory. "
99 "Increments by one if at least one thread in a warp executes the "
104 "Number of executed store instructions where the state space is not "
105 "specified and hence generic addressing is used, increments per warp on a "
106 "multiprocessor. It can include the store operations to global,local and "
107 "shared state space"),
111 "Number of instructions executed, do not include replays"),
115 "Number of instructions issued including replays"),
119 "Number of single instruction issued per cycle"),
123 "Number of dual instructions issued per cycle"),
127 "Number of single instruction issued per cycle in pipeline 0"),
131 "Number of single instruction issued per cycle in pipeline 1"),
135 "Number of dual instructions issued per cycle in pipeline 0"),
139 "Number of dual instructions issued per cycle in pipeline 1"),
142 "l1_global_load_hit",
143 "Number of cache lines that hit in L1 cache for global memory load "
144 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
145 "32, 64 and 128 bit accesses by a warp respectively"),
148 "l1_global_load_miss",
149 "Number of cache lines that miss in L1 cache for global memory load "
150 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
151 "32, 64 and 128 bit accesses by a warp respectively"),
153 _Q(L1_GLD_TRANSACTIONS
,
154 "__l1_global_load_transactions",
155 "Number of global load transactions from L1 cache. Increments by 1 per "
156 "transaction. Transaction can be 32/64/96/128B"),
158 _Q(L1_GST_TRANSACTIONS
,
159 "__l1_global_store_transactions",
160 "Number of global store transactions from L1 cache. Increments by 1 per "
161 "transaction. Transaction can be 32/64/96/128B"),
165 "Number of cache lines that hit in L1 cache for local memory load "
166 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
167 "32, 64 and 128 bit accesses by a warp respectively"),
170 "l1_local_load_miss",
171 "Number of cache lines that miss in L1 cache for local memory load "
172 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
173 "32, 64 and 128 bit accesses by a warp respectively"),
176 "l1_local_store_hit",
177 "Number of cache lines that hit in L1 cache for local memory store "
178 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
179 "32, 64 and 128 bit accesses by a warp respectively"),
182 "l1_local_store_miss",
183 "Number of cache lines that miss in L1 cache for local memory store "
184 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
185 "32,64 and 128 bit accesses by a warp respectively"),
187 _Q(L1_SHARED_LD_TRANSACTIONS
,
188 "l1_shared_load_transactions",
189 "Number of shared load transactions. Increments by 1 per transaction. "
190 "Transaction can be 32/64/96/128B"),
192 _Q(L1_SHARED_ST_TRANSACTIONS
,
193 "l1_shared_store_transactions",
194 "Number of shared store transactions. Increments by 1 per transaction. "
195 "Transaction can be 32/64/96/128B"),
199 "Number of executed load instructions where state space is specified as "
200 "local, increments per warp on a multiprocessor"),
202 _Q(LOCAL_LD_TRANSACTIONS
,
203 "local_load_transactions",
204 "Number of local load transactions from L1 cache. Increments by 1 per "
205 "transaction. Transaction can be 32/64/96/128B"),
209 "Number of executed store instructions where state space is specified as "
210 "local, increments per warp on a multiprocessor"),
212 _Q(LOCAL_ST_TRANSACTIONS
,
213 "local_store_transactions",
214 "Number of local store transactions to L1 cache. Increments by 1 per "
215 "transaction. Transaction can be 32/64/96/128B."),
217 _Q(NOT_PRED_OFF_INST_EXECUTED
,
218 "not_predicated_off_thread_inst_executed",
219 "Number of not predicated off instructions executed by all threads, does "
220 "not include replays. For each instruction it increments by the number of "
221 "threads that execute this instruction"),
225 "User profiled generic trigger that can be inserted in any place of the "
226 "code to collect the related information. Increments per warp."),
230 "User profiled generic trigger that can be inserted in any place of the "
231 "code to collect the related information. Increments per warp."),
235 "User profiled generic trigger that can be inserted in any place of the "
236 "code to collect the related information. Increments per warp."),
240 "User profiled generic trigger that can be inserted in any place of the "
241 "code to collect the related information. Increments per warp."),
245 "User profiled generic trigger that can be inserted in any place of the "
246 "code to collect the related information. Increments per warp."),
250 "User profiled generic trigger that can be inserted in any place of the "
251 "code to collect the related information. Increments per warp."),
255 "User profiled generic trigger that can be inserted in any place of the "
256 "code to collect the related information. Increments per warp."),
260 "User profiled generic trigger that can be inserted in any place of the "
261 "code to collect the related information. Increments per warp."),
265 "Number of executed load instructions where state space is specified as "
266 "shared, increments per warp on a multiprocessor"),
269 "shared_load_replay",
270 "Replays caused due to shared load bank conflict (when the addresses for "
271 "two or more shared memory load requests fall in the same memory bank) or "
272 "when there is no conflict but the total number of words accessed by all "
273 "threads in the warp executing that instruction exceed the number of words "
274 "that can be loaded in one cycle (256 bytes)"),
278 "Number of executed store instructions where state space is specified as "
279 "shared, increments per warp on a multiprocessor"),
282 "shared_store_replay",
283 "Replays caused due to shared store bank conflict (when the addresses for "
284 "two or more shared memory store requests fall in the same memory bank) or "
285 "when there is no conflict but the total number of words accessed by all "
286 "threads in the warp executing that instruction exceed the number of words "
287 "that can be stored in one cycle"),
291 "Number of thread blocks launched on a multiprocessor"),
295 "Number of threads launched on a multiprocessor"),
298 "thread_inst_executed",
299 "Number of instructions executed by all threads, does not include "
300 "replays. For each instruction it increments by the number of threads in "
301 "the warp that execute the instruction"),
303 _Q(TH_INST_EXECUTED_0
,
304 "thread_inst_executed_0",
305 "Number of instructions executed by all threads, does not include "
306 "replays. For each instruction it increments by the number of threads in "
307 "the warp that execute the instruction in pipeline 0"),
309 _Q(TH_INST_EXECUTED_1
,
310 "thread_inst_executed_1",
311 "Number of instructions executed by all threads, does not include "
312 "replays. For each instruction it increments by the number of threads in "
313 "the warp that execute the instruction in pipeline 1"),
315 _Q(TH_INST_EXECUTED_2
,
316 "thread_inst_executed_2",
317 "Number of instructions executed by all threads, does not include "
318 "replays. For each instruction it increments by the number of threads in "
319 "the warp that execute the instruction in pipeline 2"),
321 _Q(TH_INST_EXECUTED_3
,
322 "thread_inst_executed_3",
323 "Number of instructions executed by all threads, does not include "
324 "replays. For each instruction it increments by the number of threads in "
325 "the warp that execute the instruction in pipeline 3"),
327 _Q(UNCACHED_GLD_TRANSACTIONS
,
328 "uncached_global_load_transaction",
329 "Number of uncached global load transactions. Increments by 1 per "
330 "transaction. Transaction can be 32/64/96/128B."),
334 "Number of warps launched on a multiprocessor"),
339 static inline const char *
340 nvc0_hw_sm_query_get_name(unsigned query_type
)
344 for (i
= 0; i
< ARRAY_SIZE(nvc0_hw_sm_queries
); i
++) {
345 if (nvc0_hw_sm_queries
[i
].type
== query_type
)
346 return nvc0_hw_sm_queries
[i
].name
;
/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */

/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* Shader binary (GK104 ISA) that stores the MP counters into the query
    * buffer. Partial disassembly (some listing lines were lost — verify
    * against the upstream source if editing the opcodes):
    * sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r12 $physid
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c7[0x620]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c7[0x624]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c7[0x628]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x28005c1880029de4ULL,
   0x7000c01050c21c03ULL,
   0x28005c189002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x28005c18a0001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
/* Shader binary that stores the MP counters into the query buffer on GK110
 * (SM35). Same kernel as GK104, re-encoded for the GK110 ISA. */
static const uint64_t nvf0_read_hw_sm_counters_code[] =
{
   0x0880808080808080ULL,
   0x86400000109c0022ULL,
   0x86400000019c0032ULL,
   0x86400000021c0002ULL,
   0x86400000029c0006ULL,
   0x86400000031c000aULL,
   0x86400000039c000eULL,
   0x86400000041c0012ULL,
   0x08ac1080108c8080ULL,
   0x86400000049c0016ULL,
   0x86400000051c001aULL,
   0x86400000059c001eULL,
   0xdb201c007f9c201eULL,
   0x64c03ce0c41c002aULL,
   0xc00000020a1c3021ULL,
   0x64c03ce0c49c002eULL,
   0x0810a0808010b810ULL,
   0xc0000001041c3025ULL,
   0x180000000020003cULL,
   0xdb201c007f9c243eULL,
   0xc1c00000301c2021ULL,
   0xc1c00000081c2431ULL,
   0xc1c00000021c2435ULL,
   0xe0800000069c2026ULL,
   0x08b010b010b010a0ULL,
   0xe0800000061c2022ULL,
   0xe4c03c00051c0032ULL,
   0xe0840000041c282aULL,
   0xe4c03c00059c0036ULL,
   0xe08040007f9c2c2eULL,
   0xe0840000049c3032ULL,
   0xfe800000001c2800ULL,
   0x080000b81080b010ULL,
   0x64c03ce0c51c0002ULL,
   0xe08040007f9c3436ULL,
   0xfe80000020043010ULL,
   0xfc800000281c3000ULL,
   0x18000000001c003cULL
};
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * intentionally constructing them).
 */
/* Configuration of a single hardware performance-counter slot.
 * Filled in via the _C/_CA/_CB macros below and programmed into the
 * MP_PM registers when a query is activated. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
499 struct nvc0_hw_sm_query_cfg
502 struct nvc0_hw_sm_counter_cfg ctr
[8];
503 uint8_t num_counters
;
504 uint8_t norm
[2]; /* normalization num,denom */
507 #define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, g, 0, s }
508 #define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, g, 0, s }
509 #define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
511 /* ==== Compute capability 3.0 (GK104:GK110) ==== */
512 static const struct nvc0_hw_sm_query_cfg
515 .type
= NVC0_HW_SM_QUERY_ACTIVE_CYCLES
,
516 .ctr
[0] = _CB(0x0001, B6
, 0x02, 0x00000000),
521 static const struct nvc0_hw_sm_query_cfg
524 .type
= NVC0_HW_SM_QUERY_ACTIVE_WARPS
,
525 .ctr
[0] = _CB(0x003f, B6
, 0x02, 0x31483104),
530 static const struct nvc0_hw_sm_query_cfg
531 sm30_atom_cas_count
=
533 .type
= NVC0_HW_SM_QUERY_ATOM_CAS_COUNT
,
534 .ctr
[0] = _CA(0x0001, B6
, 0x1c, 0x000000004),
539 static const struct nvc0_hw_sm_query_cfg
542 .type
= NVC0_HW_SM_QUERY_ATOM_COUNT
,
543 .ctr
[0] = _CA(0x0001, B6
, 0x1c, 0x00000000),
548 static const struct nvc0_hw_sm_query_cfg
551 .type
= NVC0_HW_SM_QUERY_BRANCH
,
552 .ctr
[0] = _CA(0x0001, B6
, 0x1c, 0x0000000c),
557 static const struct nvc0_hw_sm_query_cfg
558 sm30_divergent_branch
=
560 .type
= NVC0_HW_SM_QUERY_DIVERGENT_BRANCH
,
561 .ctr
[0] = _CA(0x0001, B6
, 0x1c, 0x00000010),
566 static const struct nvc0_hw_sm_query_cfg
569 .type
= NVC0_HW_SM_QUERY_GLD_REQUEST
,
570 .ctr
[0] = _CA(0x0001, B6
, 0x1b, 0x00000010),
575 static const struct nvc0_hw_sm_query_cfg
576 sm30_gld_mem_div_replay
=
578 .type
= NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY
,
579 .ctr
[0] = _CB(0x0001, B6
, 0x08, 0x00000010),
584 static const struct nvc0_hw_sm_query_cfg
585 sm30_gst_transactions
=
587 .type
= NVC0_HW_SM_QUERY_GST_TRANSACTIONS
,
588 .ctr
[0] = _CB(0x0001, B6
, 0x11, 0x00000004),
593 static const struct nvc0_hw_sm_query_cfg
594 sm30_gst_mem_div_replay
=
596 .type
= NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY
,
597 .ctr
[0] = _CB(0x0001, B6
, 0x08, 0x00000014),
602 static const struct nvc0_hw_sm_query_cfg
605 .type
= NVC0_HW_SM_QUERY_GRED_COUNT
,
606 .ctr
[0] = _CA(0x0001, B6
, 0x1c, 0x00000008),
611 static const struct nvc0_hw_sm_query_cfg
614 .type
= NVC0_HW_SM_QUERY_GST_REQUEST
,
615 .ctr
[0] = _CA(0x0001, B6
, 0x1b, 0x00000014),
620 static const struct nvc0_hw_sm_query_cfg
623 .type
= NVC0_HW_SM_QUERY_INST_EXECUTED
,
624 .ctr
[0] = _CA(0x0003, B6
, 0x04, 0x00000398),
629 static const struct nvc0_hw_sm_query_cfg
632 .type
= NVC0_HW_SM_QUERY_INST_ISSUED1
,
633 .ctr
[0] = _CA(0x0001, B6
, 0x05, 0x00000004),
638 static const struct nvc0_hw_sm_query_cfg
641 .type
= NVC0_HW_SM_QUERY_INST_ISSUED2
,
642 .ctr
[0] = _CA(0x0001, B6
, 0x05, 0x00000008),
647 static const struct nvc0_hw_sm_query_cfg
650 .type
= NVC0_HW_SM_QUERY_L1_GLD_HIT
,
651 .ctr
[0] = _CB(0x0001, B6
, 0x10, 0x00000010),
656 static const struct nvc0_hw_sm_query_cfg
659 .type
= NVC0_HW_SM_QUERY_L1_GLD_MISS
,
660 .ctr
[0] = _CB(0x0001, B6
, 0x10, 0x00000014),
665 static const struct nvc0_hw_sm_query_cfg
666 sm30_l1_gld_transactions
=
668 .type
= NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS
,
669 .ctr
[0] = _CB(0x0001, B6
, 0x0f, 0x00000000),
674 static const struct nvc0_hw_sm_query_cfg
675 sm30_l1_gst_transactions
=
677 .type
= NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS
,
678 .ctr
[0] = _CB(0x0001, B6
, 0x0f, 0x00000004),
683 static const struct nvc0_hw_sm_query_cfg
684 sm30_l1_local_ld_hit
=
686 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT
,
687 .ctr
[0] = _CB(0x0001, B6
, 0x10, 0x00000000),
692 static const struct nvc0_hw_sm_query_cfg
693 sm30_l1_local_ld_miss
=
695 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS
,
696 .ctr
[0] = _CB(0x0001, B6
, 0x10, 0x00000004),
701 static const struct nvc0_hw_sm_query_cfg
702 sm30_l1_local_st_hit
=
704 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT
,
705 .ctr
[0] = _CB(0x0001, B6
, 0x10, 0x00000008),
710 static const struct nvc0_hw_sm_query_cfg
711 sm30_l1_local_st_miss
=
713 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS
,
714 .ctr
[0] = _CB(0x0001, B6
, 0x10, 0x0000000c),
719 static const struct nvc0_hw_sm_query_cfg
720 sm30_l1_shared_ld_transactions
=
722 .type
= NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS
,
723 .ctr
[0] = _CB(0x0001, B6
, 0x0e, 0x00000008),
728 static const struct nvc0_hw_sm_query_cfg
729 sm30_l1_shared_st_transactions
=
731 .type
= NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS
,
732 .ctr
[0] = _CB(0x0001, B6
, 0x0e, 0x0000000c),
737 static const struct nvc0_hw_sm_query_cfg
740 .type
= NVC0_HW_SM_QUERY_LOCAL_LD
,
741 .ctr
[0] = _CA(0x0001, B6
, 0x1b, 0x00000008),
746 static const struct nvc0_hw_sm_query_cfg
747 sm30_local_ld_transactions
=
749 .type
= NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS
,
750 .ctr
[0] = _CB(0x0001, B6
, 0x0e, 0x00000000),
755 static const struct nvc0_hw_sm_query_cfg
758 .type
= NVC0_HW_SM_QUERY_LOCAL_ST
,
759 .ctr
[0] = _CA(0x0001, B6
, 0x1b, 0x0000000c),
764 static const struct nvc0_hw_sm_query_cfg
765 sm30_local_st_transactions
=
767 .type
= NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS
,
768 .ctr
[0] = _CB(0x0001, B6
, 0x0e, 0x00000004),
773 static const struct nvc0_hw_sm_query_cfg
774 sm30_prof_trigger_0
=
776 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_0
,
777 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x00000000),
782 static const struct nvc0_hw_sm_query_cfg
783 sm30_prof_trigger_1
=
785 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_1
,
786 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x00000004),
791 static const struct nvc0_hw_sm_query_cfg
792 sm30_prof_trigger_2
=
794 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_2
,
795 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x00000008),
800 static const struct nvc0_hw_sm_query_cfg
801 sm30_prof_trigger_3
=
803 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_3
,
804 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x0000000c),
809 static const struct nvc0_hw_sm_query_cfg
810 sm30_prof_trigger_4
=
812 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_4
,
813 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x00000010),
818 static const struct nvc0_hw_sm_query_cfg
819 sm30_prof_trigger_5
=
821 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_5
,
822 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x00000014),
827 static const struct nvc0_hw_sm_query_cfg
828 sm30_prof_trigger_6
=
830 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_6
,
831 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x00000018),
836 static const struct nvc0_hw_sm_query_cfg
837 sm30_prof_trigger_7
=
839 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_7
,
840 .ctr
[0] = _CA(0x0001, B6
, 0x01, 0x0000001c),
845 static const struct nvc0_hw_sm_query_cfg
848 .type
= NVC0_HW_SM_QUERY_SHARED_LD
,
849 .ctr
[0] = _CA(0x0001, B6
, 0x1b, 0x00000000),
854 static const struct nvc0_hw_sm_query_cfg
855 sm30_shared_ld_replay
=
857 .type
= NVC0_HW_SM_QUERY_SHARED_LD_REPLAY
,
858 .ctr
[0] = _CB(0x0001, B6
, 0x08, 0x00000008),
863 static const struct nvc0_hw_sm_query_cfg
866 .type
= NVC0_HW_SM_QUERY_SHARED_ST
,
867 .ctr
[0] = _CA(0x0001, B6
, 0x1b, 0x00000004),
872 static const struct nvc0_hw_sm_query_cfg
873 sm30_shared_st_replay
=
875 .type
= NVC0_HW_SM_QUERY_SHARED_ST_REPLAY
,
876 .ctr
[0] = _CB(0x0001, B6
, 0x08, 0x0000000c),
881 static const struct nvc0_hw_sm_query_cfg
882 sm30_sm_cta_launched
=
884 .type
= NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED
,
885 .ctr
[0] = _CB(0x0001, B6
, 0x02, 0x0000001c),
890 static const struct nvc0_hw_sm_query_cfg
891 sm30_threads_launched
=
893 .type
= NVC0_HW_SM_QUERY_THREADS_LAUNCHED
,
894 .ctr
[0] = _CA(0x003f, B6
, 0x03, 0x398a4188),
899 static const struct nvc0_hw_sm_query_cfg
900 sm30_uncached_gld_transactions
=
902 .type
= NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS
,
903 .ctr
[0] = _CB(0x0001, B6
, 0x11, 0x00000000),
908 static const struct nvc0_hw_sm_query_cfg
909 sm30_warps_launched
=
911 .type
= NVC0_HW_SM_QUERY_WARPS_LAUNCHED
,
912 .ctr
[0] = _CA(0x0001, B6
, 0x03, 0x00000004),
/* NOTE (hardware quirks):
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 */
921 static const struct nvc0_hw_sm_query_cfg
*sm30_hw_sm_queries
[] =
925 &sm30_atom_cas_count
,
928 &sm30_divergent_branch
,
930 &sm30_gld_mem_div_replay
,
931 &sm30_gst_transactions
,
932 &sm30_gst_mem_div_replay
,
940 &sm30_l1_gld_transactions
,
941 &sm30_l1_gst_transactions
,
942 &sm30_l1_local_ld_hit
,
943 &sm30_l1_local_ld_miss
,
944 &sm30_l1_local_st_hit
,
945 &sm30_l1_local_st_miss
,
946 &sm30_l1_shared_ld_transactions
,
947 &sm30_l1_shared_st_transactions
,
949 &sm30_local_ld_transactions
,
951 &sm30_local_st_transactions
,
952 &sm30_prof_trigger_0
,
953 &sm30_prof_trigger_1
,
954 &sm30_prof_trigger_2
,
955 &sm30_prof_trigger_3
,
956 &sm30_prof_trigger_4
,
957 &sm30_prof_trigger_5
,
958 &sm30_prof_trigger_6
,
959 &sm30_prof_trigger_7
,
961 &sm30_shared_ld_replay
,
963 &sm30_shared_st_replay
,
964 &sm30_sm_cta_launched
,
965 &sm30_threads_launched
,
966 &sm30_uncached_gld_transactions
,
967 &sm30_warps_launched
,
970 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
971 static const struct nvc0_hw_sm_query_cfg
972 sm35_atom_cas_count
=
974 .type
= NVC0_HW_SM_QUERY_ATOM_CAS_COUNT
,
975 .ctr
[0] = _CA(0x0001, B6
, 0x1a, 0x00000014),
980 static const struct nvc0_hw_sm_query_cfg
983 .type
= NVC0_HW_SM_QUERY_ATOM_COUNT
,
984 .ctr
[0] = _CA(0x0001, B6
, 0x1a, 0x00000010),
989 static const struct nvc0_hw_sm_query_cfg
992 .type
= NVC0_HW_SM_QUERY_GRED_COUNT
,
993 .ctr
[0] = _CA(0x0001, B6
, 0x1a, 0x00000018),
998 static const struct nvc0_hw_sm_query_cfg
999 sm35_not_pred_off_inst_executed
=
1001 .type
= NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED
,
1002 .ctr
[0] = _CA(0x003f, B6
, 0x14, 0x29062080),
1007 static const struct nvc0_hw_sm_query_cfg
1008 sm35_shared_ld_replay
=
1010 .type
= NVC0_HW_SM_QUERY_SHARED_LD_REPLAY
,
1011 .ctr
[0] = _CB(0xaaaa, LOGOP
, 0x13, 0x00000018),
1012 .ctr
[1] = _CB(0x8888, LOGOP
, 0x08, 0x00000151),
1017 static const struct nvc0_hw_sm_query_cfg
1018 sm35_shared_st_replay
=
1020 .type
= NVC0_HW_SM_QUERY_SHARED_ST_REPLAY
,
1021 .ctr
[0] = _CB(0xaaaa, LOGOP
, 0x13, 0x00000018),
1022 .ctr
[1] = _CB(0x8888, LOGOP
, 0x08, 0x000001d1),
1027 static const struct nvc0_hw_sm_query_cfg
1028 sm35_th_inst_executed
=
1030 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED
,
1031 .ctr
[0] = _CA(0x003f, B6
, 0x11, 0x29062080),
1036 static const struct nvc0_hw_sm_query_cfg
*sm35_hw_sm_queries
[] =
1038 &sm30_active_cycles
,
1040 &sm35_atom_cas_count
,
1043 &sm30_gld_mem_div_replay
,
1044 &sm30_gst_transactions
,
1045 &sm30_gst_mem_div_replay
,
1048 &sm30_inst_executed
,
1053 &sm30_l1_gld_transactions
,
1054 &sm30_l1_gst_transactions
,
1055 &sm30_l1_local_ld_hit
,
1056 &sm30_l1_local_ld_miss
,
1057 &sm30_l1_local_st_hit
,
1058 &sm30_l1_local_st_miss
,
1059 &sm30_l1_shared_ld_transactions
,
1060 &sm30_l1_shared_st_transactions
,
1062 &sm30_local_ld_transactions
,
1064 &sm30_local_st_transactions
,
1065 &sm35_not_pred_off_inst_executed
,
1066 &sm30_prof_trigger_0
,
1067 &sm30_prof_trigger_1
,
1068 &sm30_prof_trigger_2
,
1069 &sm30_prof_trigger_3
,
1070 &sm30_prof_trigger_4
,
1071 &sm30_prof_trigger_5
,
1072 &sm30_prof_trigger_6
,
1073 &sm30_prof_trigger_7
,
1075 &sm35_shared_ld_replay
,
1077 &sm35_shared_st_replay
,
1078 &sm30_sm_cta_launched
,
1079 &sm35_th_inst_executed
,
1080 &sm30_threads_launched
,
1081 &sm30_uncached_gld_transactions
,
1082 &sm30_warps_launched
,
1089 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* NOTE:
 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
 *   because there is a context-switch problem that we need to fix.
 *   Results might be wrong sometimes, be careful!
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* Shader binary (GF100 ISA) that stores the MP counters into the query
    * buffer. Partial disassembly (some listing lines were lost — verify
    * against the upstream source if editing the opcodes):
    * mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c15[0x620]
    * mov b32 $r11 c15[0x624]
    * ext u32 $r8 $r9 0x414
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c15[0x628]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x28007c1880029de4ULL,
   0x28007c189002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x28007c18a0021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
1145 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
1147 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
1148 static const struct nvc0_hw_sm_query_cfg
1149 sm20_active_cycles
=
1151 .type
= NVC0_HW_SM_QUERY_ACTIVE_CYCLES
,
1152 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x11, 0x000000ff, 0x00000000),
1157 static const struct nvc0_hw_sm_query_cfg
1160 .type
= NVC0_HW_SM_QUERY_ACTIVE_WARPS
,
1161 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000010),
1162 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000020),
1163 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000030),
1164 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000040),
1165 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000050),
1166 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000060),
1171 static const struct nvc0_hw_sm_query_cfg
1174 .type
= NVC0_HW_SM_QUERY_ATOM_COUNT
,
1175 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000030),
1180 static const struct nvc0_hw_sm_query_cfg
1183 .type
= NVC0_HW_SM_QUERY_BRANCH
,
1184 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000000),
1185 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000010),
1190 static const struct nvc0_hw_sm_query_cfg
1191 sm20_divergent_branch
=
1193 .type
= NVC0_HW_SM_QUERY_DIVERGENT_BRANCH
,
1194 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000020),
1195 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000030),
1200 static const struct nvc0_hw_sm_query_cfg
1203 .type
= NVC0_HW_SM_QUERY_GLD_REQUEST
,
1204 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000030),
1209 static const struct nvc0_hw_sm_query_cfg
1212 .type
= NVC0_HW_SM_QUERY_GRED_COUNT
,
1213 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000040),
/* ---- Compute capability 2.0 (GF100/GF110) MP counter configurations ----
 * Each nvc0_hw_sm_query_cfg names a driver query type and the hardware
 * signal/source selects (via the _C() macro) needed to count it.
 * NOTE(review): this region was damaged during extraction -- initializer
 * braces, some identifier lines and the .num_counters/.norm fields are
 * missing; restore them from the upstream file before building. */
/* Global-store requests (signal 0x64, source slot 6).
 * NOTE(review): identifier line lost; the type suggests sm20_gst_request. */
1218 static const struct nvc0_hw_sm_query_cfg
1221 .type
= NVC0_HW_SM_QUERY_GST_REQUEST
,
1222 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000060),
/* Warp-level instructions executed: two 16-bit counters on signal 0x2d. */
1227 static const struct nvc0_hw_sm_query_cfg
1228 sm20_inst_executed
=
1230 .type
= NVC0_HW_SM_QUERY_INST_EXECUTED
,
1231 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001000),
1232 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001010),
/* Instructions issued: two counters on signal 0x27.
 * NOTE(review): identifier line lost; presumably sm20_inst_issued. */
1237 static const struct nvc0_hw_sm_query_cfg
1240 .type
= NVC0_HW_SM_QUERY_INST_ISSUED
,
1241 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007060),
1242 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007070),
/* Local-memory loads (signal 0x64, slot 2). Identifier line lost. */
1247 static const struct nvc0_hw_sm_query_cfg
1250 .type
= NVC0_HW_SM_QUERY_LOCAL_LD
,
1251 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000020),
/* Local-memory stores (signal 0x64, slot 5). Identifier line lost. */
1256 static const struct nvc0_hw_sm_query_cfg
1259 .type
= NVC0_HW_SM_QUERY_LOCAL_ST
,
1260 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000050),
/* User profiler triggers 0..7: same signal (0x01), source slot = trigger
 * index (0x00 .. 0x70 in the src_sel field). */
1265 static const struct nvc0_hw_sm_query_cfg
1266 sm20_prof_trigger_0
=
1268 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_0
,
1269 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000000),
1274 static const struct nvc0_hw_sm_query_cfg
1275 sm20_prof_trigger_1
=
1277 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_1
,
1278 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000010),
1283 static const struct nvc0_hw_sm_query_cfg
1284 sm20_prof_trigger_2
=
1286 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_2
,
1287 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000020),
1292 static const struct nvc0_hw_sm_query_cfg
1293 sm20_prof_trigger_3
=
1295 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_3
,
1296 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000030),
1301 static const struct nvc0_hw_sm_query_cfg
1302 sm20_prof_trigger_4
=
1304 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_4
,
1305 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000040),
1310 static const struct nvc0_hw_sm_query_cfg
1311 sm20_prof_trigger_5
=
1313 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_5
,
1314 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000050),
1319 static const struct nvc0_hw_sm_query_cfg
1320 sm20_prof_trigger_6
=
1322 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_6
,
1323 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000060),
1328 static const struct nvc0_hw_sm_query_cfg
1329 sm20_prof_trigger_7
=
1331 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_7
,
1332 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000070),
/* Shared-memory loads (signal 0x64, slot 1). Identifier line lost. */
1337 static const struct nvc0_hw_sm_query_cfg
1340 .type
= NVC0_HW_SM_QUERY_SHARED_LD
,
1341 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000010),
/* Shared-memory stores (signal 0x64, slot 4). Identifier line lost. */
1346 static const struct nvc0_hw_sm_query_cfg
1349 .type
= NVC0_HW_SM_QUERY_SHARED_ST
,
1350 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000040),
/* Threads launched: six counters on signal 0x26 (slots 1..6), summed at
 * readout time. */
1355 static const struct nvc0_hw_sm_query_cfg
1356 sm20_threads_launched
=
1358 .type
= NVC0_HW_SM_QUERY_THREADS_LAUNCHED
,
1359 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000010),
1360 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000020),
1361 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000030),
1362 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000040),
1363 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000050),
1364 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000060),
/* Thread-level instructions executed, low half: six slots of signal 0x2f. */
1369 static const struct nvc0_hw_sm_query_cfg
1370 sm20_th_inst_executed_0
=
1372 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0
,
1373 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000000),
1374 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000010),
1375 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000020),
1376 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000030),
1377 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000040),
1378 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000050),
/* Thread-level instructions executed, high half: signal 0x30. */
1383 static const struct nvc0_hw_sm_query_cfg
1384 sm20_th_inst_executed_1
=
1386 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1
,
1387 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000000),
1388 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000010),
1389 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000020),
1390 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000030),
1391 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000040),
1392 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000050),
/* Warps launched: single counter, signal 0x26 slot 0. */
1397 static const struct nvc0_hw_sm_query_cfg
1398 sm20_warps_launched
=
1400 .type
= NVC0_HW_SM_QUERY_WARPS_LAUNCHED
,
1401 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000000),
/* Table of all queries exposed on SM 2.0 (GF100/GF110); its order defines
 * the driver-query enumeration returned to the state tracker.
 * NOTE(review): the opening '{' and terminating '};' were lost in
 * extraction, as were the entries between the visible ones (original lines
 * 1409-1411, 1413-1415, etc.). */
1406 static const struct nvc0_hw_sm_query_cfg
*sm20_hw_sm_queries
[] =
1408 &sm20_active_cycles
,
1412 &sm20_divergent_branch
,
1416 &sm20_inst_executed
,
1420 &sm20_prof_trigger_0
,
1421 &sm20_prof_trigger_1
,
1422 &sm20_prof_trigger_2
,
1423 &sm20_prof_trigger_3
,
1424 &sm20_prof_trigger_4
,
1425 &sm20_prof_trigger_5
,
1426 &sm20_prof_trigger_6
,
1427 &sm20_prof_trigger_7
,
1430 &sm20_threads_launched
,
1431 &sm20_th_inst_executed_0
,
1432 &sm20_th_inst_executed_1
,
1433 &sm20_warps_launched
,
/* SM 2.1 configs override only the counters whose signal ids differ from
 * SM 2.0; everything else is shared via the sm20_* configs above.
 * NOTE(review): initializer braces and .num_counters/.norm fields were lost
 * in extraction. */
1436 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
/* Warp-level instructions executed: three counters on signal 0x2d. */
1437 static const struct nvc0_hw_sm_query_cfg
1438 sm21_inst_executed
=
1440 .type
= NVC0_HW_SM_QUERY_INST_EXECUTED
,
1441 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000000),
1442 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000010),
1443 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000020),
/* Single-issue / dual-issue counters per scheduler: signal 0x7e, with the
 * source slot selecting the event. */
1448 static const struct nvc0_hw_sm_query_cfg
1449 sm21_inst_issued1_0
=
1451 .type
= NVC0_HW_SM_QUERY_INST_ISSUED1_0
,
1452 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000010),
1457 static const struct nvc0_hw_sm_query_cfg
1458 sm21_inst_issued1_1
=
1460 .type
= NVC0_HW_SM_QUERY_INST_ISSUED1_1
,
1461 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000040),
1466 static const struct nvc0_hw_sm_query_cfg
1467 sm21_inst_issued2_0
=
1469 .type
= NVC0_HW_SM_QUERY_INST_ISSUED2_0
,
1470 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000020),
1475 static const struct nvc0_hw_sm_query_cfg
1476 sm21_inst_issued2_1
=
1478 .type
= NVC0_HW_SM_QUERY_INST_ISSUED2_1
,
1479 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000050),
/* Thread-level instructions executed, quarters 0..3: signals 0xa3, 0xa5,
 * 0xa4, 0xa6 respectively, six source slots each. */
1484 static const struct nvc0_hw_sm_query_cfg
1485 sm21_th_inst_executed_0
=
1487 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0
,
1488 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000000),
1489 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000010),
1490 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000020),
1491 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000030),
1492 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000040),
1493 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000050),
1498 static const struct nvc0_hw_sm_query_cfg
1499 sm21_th_inst_executed_1
=
1501 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1
,
1502 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000000),
1503 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000010),
1504 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000020),
1505 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000030),
1506 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000040),
1507 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000050),
1512 static const struct nvc0_hw_sm_query_cfg
1513 sm21_th_inst_executed_2
=
1515 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2
,
1516 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000000),
1517 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000010),
1518 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000020),
1519 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000030),
1520 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000040),
1521 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000050),
1526 static const struct nvc0_hw_sm_query_cfg
1527 sm21_th_inst_executed_3
=
1529 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3
,
1530 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000000),
1531 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000010),
1532 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000020),
1533 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000030),
1534 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000040),
1535 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000050),
/* Table of all queries exposed on SM 2.1; mixes sm21_* overrides with the
 * shared sm20_* configs. NOTE(review): opening '{', terminating '};' and
 * some entries (original lines 1543-1545, 1547-1549, 1555-1556, 1565-1566)
 * were lost in extraction. */
1540 static const struct nvc0_hw_sm_query_cfg
*sm21_hw_sm_queries
[] =
1542 &sm20_active_cycles
,
1546 &sm20_divergent_branch
,
1550 &sm21_inst_executed
,
1551 &sm21_inst_issued1_0
,
1552 &sm21_inst_issued1_1
,
1553 &sm21_inst_issued2_0
,
1554 &sm21_inst_issued2_1
,
1557 &sm20_prof_trigger_0
,
1558 &sm20_prof_trigger_1
,
1559 &sm20_prof_trigger_2
,
1560 &sm20_prof_trigger_3
,
1561 &sm20_prof_trigger_4
,
1562 &sm20_prof_trigger_5
,
1563 &sm20_prof_trigger_6
,
1564 &sm20_prof_trigger_7
,
1567 &sm20_threads_launched
,
1568 &sm21_th_inst_executed_0
,
1569 &sm21_th_inst_executed_1
,
1570 &sm21_th_inst_executed_2
,
1571 &sm21_th_inst_executed_3
,
1572 &sm20_warps_launched
,
/* Pick the query-config table matching the screen's SM architecture:
 * SM 3.5, SM 3.0, or (for the Fermi default case) SM 2.0 on GF100/GF110
 * chipsets 0xc0/0xc8 and SM 2.1 on all other GF10x/GF11x.
 * NOTE(review): the switch's case labels and braces (original lines
 * 1579, 1581, 1583, 1585, 1587, 1591-1593) were lost in extraction. */
1577 static inline const struct nvc0_hw_sm_query_cfg
**
1578 nvc0_hw_sm_get_queries(struct nvc0_screen
*screen
)
1580 struct nouveau_device
*dev
= screen
->base
.device
;
1582 switch (screen
->base
.class_3d
) {
1584 return sm35_hw_sm_queries
;
1586 return sm30_hw_sm_queries
;
1588 if (dev
->chipset
== 0xc0 || dev
->chipset
== 0xc8)
1589 return sm20_hw_sm_queries
;
1590 return sm21_hw_sm_queries
;
/* Return the number of entries in the table nvc0_hw_sm_get_queries() would
 * select for this screen; the two functions must switch on the same cases.
 * NOTE(review): return type line, case labels and braces were lost in
 * extraction (original lines 1596, 1598, 1600, 1602, 1604, 1606, ...). */
1597 nvc0_hw_sm_get_num_queries(struct nvc0_screen
*screen
)
1599 struct nouveau_device
*dev
= screen
->base
.device
;
1601 switch (screen
->base
.class_3d
) {
1603 return ARRAY_SIZE(sm35_hw_sm_queries
);
1605 return ARRAY_SIZE(sm30_hw_sm_queries
);
/* Same GF100/GF110-vs-rest split as nvc0_hw_sm_get_queries(). */
1607 if (dev
->chipset
== 0xc0 || dev
->chipset
== 0xc8)
1608 return ARRAY_SIZE(sm20_hw_sm_queries
);
1609 return ARRAY_SIZE(sm21_hw_sm_queries
);
/* Look up the counter configuration whose type matches the given query by
 * scanning the architecture's table. NOTE(review): the loop body's return,
 * the trailing assert/return and the declaration of 'i' (original lines
 * 1621, 1628-1632) were lost in extraction. */
1614 static const struct nvc0_hw_sm_query_cfg
*
1615 nvc0_hw_sm_query_get_cfg(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1617 const struct nvc0_hw_sm_query_cfg
**queries
;
1618 struct nvc0_screen
*screen
= nvc0
->screen
;
1619 struct nvc0_query
*q
= &hq
->base
;
1620 unsigned num_queries
;
1623 num_queries
= nvc0_hw_sm_get_num_queries(screen
);
1624 queries
= nvc0_hw_sm_get_queries(screen
);
/* Linear search: NVC0_HW_SM_QUERY() maps a cfg type to the pipe query
 * enum value stored in q->type. */
1626 for (i
= 0; i
< num_queries
; i
++) {
1627 if (NVC0_HW_SM_QUERY(queries
[i
]->type
) == q
->type
)
/* Destroy callback: release the query's buffer (allocate with size 0 frees
 * it), drop the fence reference, and free the object.
 * NOTE(review): the static void line and trailing FREE(hq) (original lines
 * 1634, 1640) were lost in extraction. */
1635 nvc0_hw_sm_destroy_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1637 struct nvc0_query
*q
= &hq
->base
;
1638 nvc0_hw_query_allocate(nvc0
, q
, 0);
1639 nouveau_fence_ref(NULL
, &hq
->fence
);
/* Begin a SM performance query on NVE4+ (Kepler): reserve counter slots in
 * the two signal domains (A/B, four slots each), program the MP_PM_*
 * methods and zero the per-MP sequence words.
 * NOTE(review): return-type line, local declarations (i, c), several
 * braces/returns and the slot-assignment statements were lost in
 * extraction; compare with upstream before building. */
1644 nve4_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1646 struct nvc0_screen
*screen
= nvc0
->screen
;
1647 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1648 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
)
1649 const struct nvc0_hw_sm_query_cfg
*cfg
;
1651 unsigned num_ab
[2] = { 0, 0 };
1653 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1655 /* check if we have enough free counter slots */
1656 for (i
= 0; i
< cfg
->num_counters
; ++i
)
1657 num_ab
[cfg
->ctr
[i
].sig_dom
]++;
1659 if (screen
->pm
.num_hw_sm_active
[0] + num_ab
[0] > 4 ||
1660 screen
->pm
.num_hw_sm_active
[1] + num_ab
[1] > 4) {
1661 NOUVEAU_ERR("Not enough free MP counter slots !\n");
1665 assert(cfg
->num_counters
<= 4);
1666 PUSH_SPACE(push
, 4 * 8 * + 6);
/* One-time global enable of MP counting via the SW (firmware) method. */
1668 if (!screen
->pm
.mp_counters_enabled
) {
1669 screen
->pm
.mp_counters_enabled
= true;
1670 BEGIN_NVC0(push
, SUBC_SW(0x06ac), 1);
1671 PUSH_DATA (push
, 0x1fcb);
1674 /* set sequence field to 0 (used to check if result is available) */
1675 for (i
= 0; i
< screen
->mp_count
; ++i
)
1676 hq
->data
[i
* 10 + 10] = 0;
1679 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1680 const unsigned d
= cfg
->ctr
[i
].sig_dom
;
/* First counter in a domain: tell the firmware which domain(s) are now
 * in use (bit 22 plus a per-domain enable bit). */
1682 if (!screen
->pm
.num_hw_sm_active
[d
]) {
1683 uint32_t m
= (1 << 22) | (1 << (7 + (8 * !d
)));
1684 if (screen
->pm
.num_hw_sm_active
[!d
])
1685 m
|= 1 << (7 + (8 * d
));
1686 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
1687 PUSH_DATA (push
, m
);
1689 screen
->pm
.num_hw_sm_active
[d
]++;
/* Claim the first free slot in this domain's group of four. */
1691 for (c
= d
* 4; c
< (d
* 4 + 4); ++c
) {
1692 if (!screen
->pm
.mp_counter
[c
]) {
1694 screen
->pm
.mp_counter
[c
] = hsq
;
1698 assert(c
<= (d
* 4 + 3)); /* must succeed, already checked for space */
1700 /* configure and reset the counter(s) */
1702 BEGIN_NVC0(push
, NVE4_CP(MP_PM_A_SIGSEL(c
& 3)), 1);
1704 BEGIN_NVC0(push
, NVE4_CP(MP_PM_B_SIGSEL(c
& 3)), 1);
1705 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
1706 BEGIN_NVC0(push
, NVE4_CP(MP_PM_SRCSEL(c
)), 1);
1707 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
+ 0x2108421 * (c
& 3));
1708 BEGIN_NVC0(push
, NVE4_CP(MP_PM_FUNC(c
)), 1);
1709 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1710 BEGIN_NVC0(push
, NVE4_CP(MP_PM_SET(c
)), 1);
1711 PUSH_DATA (push
, 0);
/* Begin a SM performance query on Fermi (NVC0..NVE4); forwards to the NVE4
 * path on Kepler+. Fermi has a single domain of 8 counter slots.
 * NOTE(review): return-type line, local declarations (i, c), braces and
 * several statements (slot assignment, sequence increment) were lost in
 * extraction. */
1717 nvc0_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1719 struct nvc0_screen
*screen
= nvc0
->screen
;
1720 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1721 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1722 const struct nvc0_hw_sm_query_cfg
*cfg
;
1725 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1726 return nve4_hw_sm_begin_query(nvc0
, hq
);
1728 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1730 /* check if we have enough free counter slots */
1731 if (screen
->pm
.num_hw_sm_active
[0] + cfg
->num_counters
> 8) {
1732 NOUVEAU_ERR("Not enough free MP counter slots !\n");
1736 assert(cfg
->num_counters
<= 8);
1737 PUSH_SPACE(push
, 8 * 8 + 2);
1739 /* set sequence field to 0 (used to check if result is available) */
1740 for (i
= 0; i
< screen
->mp_count
; ++i
) {
1741 const unsigned b
= (0x30 / 4) * i
;
1742 hq
->data
[b
+ 8] = 0;
1746 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1747 uint32_t mask_sel
= 0x00000000;
/* First active counter: enable MP counting via the SW method. */
1749 if (!screen
->pm
.num_hw_sm_active
[0]) {
1750 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
1751 PUSH_DATA (push
, 0x80000000);
1753 screen
->pm
.num_hw_sm_active
[0]++;
/* Claim the first free slot out of the 8 available. */
1755 for (c
= 0; c
< 8; ++c
) {
1756 if (!screen
->pm
.mp_counter
[c
]) {
1758 screen
->pm
.mp_counter
[c
] = hsq
;
1763 /* Oddly-enough, the signal id depends on the slot selected on Fermi but
1764 * not on Kepler. Fortunately, the signal ids are just offseted by the
1767 mask_sel
|= (c
<< 8);
1768 mask_sel
|= (c
<< 16);
1769 mask_sel
|= (c
<< 24);
1770 mask_sel
&= cfg
->ctr
[i
].src_mask
;
1772 /* configure and reset the counter(s) */
1773 BEGIN_NVC0(push
, NVC0_CP(MP_PM_SIGSEL(c
)), 1);
1774 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
1775 BEGIN_NVC0(push
, NVC0_CP(MP_PM_SRCSEL(c
)), 1);
1776 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
| mask_sel
);
1777 BEGIN_NVC0(push
, NVC0_CP(MP_PM_OP(c
)), 1);
1778 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1779 BEGIN_NVC0(push
, NVC0_CP(MP_PM_SET(c
)), 1);
1780 PUSH_DATA (push
, 0);
/* Allocate and fill the compute program that reads the MP counters into the
 * query buffer; selects the pre-assembled blob matching the 3D class
 * (NVE4/NVF0 share one branch, Fermi the other).
 * NOTE(review): NULL check after CALLOC_STRUCT, braces and the trailing
 * return (original lines 1787, 1789, 1791-1793, 1812-1813) were lost in
 * extraction. Ownership: caller stores the result in screen->pm.prog. */
1785 static inline struct nvc0_program
*
1786 nvc0_hw_sm_get_program(struct nvc0_screen
*screen
)
1788 struct nvc0_program
*prog
;
1790 prog
= CALLOC_STRUCT(nvc0_program
);
1794 prog
->type
= PIPE_SHADER_COMPUTE
;
1795 prog
->translated
= true;
1796 prog
->parm_size
= 12;
1798 if (screen
->base
.class_3d
== NVE4_3D_CLASS
||
1799 screen
->base
.class_3d
== NVF0_3D_CLASS
) {
1800 if (screen
->base
.class_3d
== NVE4_3D_CLASS
) {
1801 prog
->code
= (uint32_t *)nve4_read_hw_sm_counters_code
;
1802 prog
->code_size
= sizeof(nve4_read_hw_sm_counters_code
);
1804 prog
->code
= (uint32_t *)nvf0_read_hw_sm_counters_code
;
1805 prog
->code_size
= sizeof(nvf0_read_hw_sm_counters_code
);
1807 prog
->num_gprs
= 14;
1809 prog
->code
= (uint32_t *)nvc0_read_hw_sm_counters_code
;
1810 prog
->code_size
= sizeof(nvc0_read_hw_sm_counters_code
);
1811 prog
->num_gprs
= 12;
/* Upload the three input words (buffer address lo/hi + sequence number) the
 * counter-readback compute shader expects, into the auxiliary constant
 * buffer at NVC0_CB_AUX_MP_INFO. Kepler uses the UPLOAD_* inline-upload
 * path, Fermi writes through CB_SIZE/CB_POS.
 * NOTE(review): return-type line, the declarations of 'address'/'s' and
 * several braces were lost in extraction. */
1817 nvc0_hw_sm_upload_input(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1819 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1820 struct nvc0_screen
*screen
= nvc0
->screen
;
1824 address
= screen
->uniform_bo
->offset
+ NVC0_CB_AUX_INFO(s
);
1826 PUSH_SPACE(push
, 11);
1828 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1829 BEGIN_NVC0(push
, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH
), 2);
1830 PUSH_DATAh(push
, address
+ NVC0_CB_AUX_MP_INFO
);
1831 PUSH_DATA (push
, address
+ NVC0_CB_AUX_MP_INFO
);
1832 BEGIN_NVC0(push
, NVE4_CP(UPLOAD_LINE_LENGTH_IN
), 2);
1833 PUSH_DATA (push
, 3 * 4);
1834 PUSH_DATA (push
, 0x1);
1835 BEGIN_1IC0(push
, NVE4_CP(UPLOAD_EXEC
), 1 + 3);
1836 PUSH_DATA (push
, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR
| (0x20 << 1));
1838 BEGIN_NVC0(push
, NVC0_CP(CB_SIZE
), 3);
1839 PUSH_DATA (push
, NVC0_CB_AUX_SIZE
);
1840 PUSH_DATAh(push
, address
);
1841 PUSH_DATA (push
, address
);
1842 BEGIN_1IC0(push
, NVC0_CP(CB_POS
), 1 + 3);
1843 PUSH_DATA (push
, NVC0_CB_AUX_MP_INFO
);
/* Payload: destination GPU address (lo, hi) then the current sequence
 * number the shader writes back when done. */
1845 PUSH_DATA (push
, (hq
->bo
->offset
+ hq
->base_offset
));
1846 PUSH_DATAh(push
, (hq
->bo
->offset
+ hq
->base_offset
));
1847 PUSH_DATA (push
, hq
->sequence
);
/* End a SM performance query: stop all counting, release this query's
 * counter slots, launch the readback compute shader into the query buffer,
 * then re-program any counters still owned by other active queries.
 * NOTE(review): return-type line, local declarations (c, i, mask), braces,
 * the is_nve4 conditionals and some statements were lost in extraction. */
1851 nvc0_hw_sm_end_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1853 struct nvc0_screen
*screen
= nvc0
->screen
;
1854 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
1855 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1856 const bool is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
1857 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1858 struct nvc0_program
*old
= nvc0
->compprog
;
1859 struct pipe_grid_info info
= {};
/* Launch geometry: one workgroup per MP, one per GPC in y on Kepler. */
1862 const uint block
[3] = { 32, is_nve4
? 4 : 1, 1 };
1863 const uint grid
[3] = { screen
->mp_count
, screen
->gpc_count
, 1 };
/* Lazily build the readback program on first use. */
1866 if (unlikely(!screen
->pm
.prog
))
1867 screen
->pm
.prog
= nvc0_hw_sm_get_program(screen
);
1869 /* disable all counting */
1870 PUSH_SPACE(push
, 8);
1871 for (c
= 0; c
< 8; ++c
)
1872 if (screen
->pm
.mp_counter
[c
]) {
1874 IMMED_NVC0(push
, NVE4_CP(MP_PM_FUNC(c
)), 0);
1876 IMMED_NVC0(push
, NVC0_CP(MP_PM_OP(c
)), 0);
1879 /* release counters for this query */
1880 for (c
= 0; c
< 8; ++c
) {
1881 if (screen
->pm
.mp_counter
[c
] == hsq
) {
1882 uint8_t d
= is_nve4
? c
/ 4 : 0; /* only one domain for NVC0:NVE4 */
1883 screen
->pm
.num_hw_sm_active
[d
]--;
1884 screen
->pm
.mp_counter
[c
] = NULL
;
1888 BCTX_REFN_bo(nvc0
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
1891 PUSH_SPACE(push
, 1);
1892 IMMED_NVC0(push
, SUBC_CP(NV50_GRAPH_SERIALIZE
), 0);
1894 /* upload input data for the compute shader which reads MP counters */
1895 nvc0_hw_sm_upload_input(nvc0
, hq
);
1897 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
1898 for (i
= 0; i
< 3; i
++) {
1899 info
.block
[i
] = block
[i
];
1900 info
.grid
[i
] = grid
[i
];
1904 pipe
->launch_grid(pipe
, &info
);
1905 pipe
->bind_compute_state(pipe
, old
);
1907 nouveau_bufctx_reset(nvc0
->bufctx_cp
, NVC0_BIND_CP_QUERY
);
1909 /* re-activate other counters */
1910 PUSH_SPACE(push
, 16);
1912 for (c
= 0; c
< 8; ++c
) {
1913 const struct nvc0_hw_sm_query_cfg
*cfg
;
1916 hsq
= screen
->pm
.mp_counter
[c
];
1920 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, &hsq
->base
);
/* 'mask' tracks which slots of this surviving query were already
 * re-programmed, so each is touched only once. */
1921 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1922 if (mask
& (1 << hsq
->ctr
[i
]))
1924 mask
|= 1 << hsq
->ctr
[i
];
1926 BEGIN_NVC0(push
, NVE4_CP(MP_PM_FUNC(hsq
->ctr
[i
])), 1);
1928 BEGIN_NVC0(push
, NVC0_CP(MP_PM_OP(hsq
->ctr
[i
])), 1);
1930 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
/* Read back Fermi counter results: one 0x30-byte record per MP; word 8 of
 * each record is the sequence number proving the readback shader finished.
 * Optionally waits on the buffer when 'wait' is set.
 * NOTE(review): return-type line, mp_count parameter line, local
 * declarations (p, c) and the non-wait early-return path were lost in
 * extraction. */
1936 nvc0_hw_sm_query_read_data(uint32_t count
[32][8],
1937 struct nvc0_context
*nvc0
, bool wait
,
1938 struct nvc0_hw_query
*hq
,
1939 const struct nvc0_hw_sm_query_cfg
*cfg
,
1942 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1945 for (p
= 0; p
< mp_count
; ++p
) {
1946 const unsigned b
= (0x30 / 4) * p
;
1948 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
/* Sequence mismatch: results not ready yet for this MP. */
1949 if (hq
->data
[b
+ 8] != hq
->sequence
) {
1952 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
/* NOTE(review): the '* (1 << c)' scale looks like a per-slot weight
 * applied at readout -- confirm against upstream before relying on it. */
1955 count
[p
][c
] = hq
->data
[b
+ hsq
->ctr
[c
]] * (1 << c
);
/* Read back Kepler counter results: one 0x60-byte record per MP with four
 * warp-scheduler sub-records; words 20..23 hold the per-scheduler sequence
 * numbers. Counters in domain B (slot & ~3) have a single accumulated value
 * at word 16+, domain-A counters are summed over the four schedulers.
 * NOTE(review): return-type line, mp_count parameter line, local
 * declarations (p, c, d) and several braces/returns were lost in
 * extraction. */
1962 nve4_hw_sm_query_read_data(uint32_t count
[32][8],
1963 struct nvc0_context
*nvc0
, bool wait
,
1964 struct nvc0_hw_query
*hq
,
1965 const struct nvc0_hw_sm_query_cfg
*cfg
,
1968 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1971 for (p
= 0; p
< mp_count
; ++p
) {
1972 const unsigned b
= (0x60 / 4) * p
;
1974 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
/* Domain-B slots have one sequence word to check, domain-A four. */
1976 for (d
= 0; d
< ((hsq
->ctr
[c
] & ~3) ? 1 : 4); ++d
) {
1977 if (hq
->data
[b
+ 20 + d
] != hq
->sequence
) {
1980 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1983 if (hsq
->ctr
[c
] & ~0x3)
1984 count
[p
][c
] = hq
->data
[b
+ 16 + (hsq
->ctr
[c
] & 3)];
1986 count
[p
][c
] += hq
->data
[b
+ d
* 4 + hsq
->ctr
[c
]];
/* Aggregate per-MP counter values into a single uint64 result: sum over all
 * MPs and counter slots, then apply the config's norm[0]/norm[1] scaling.
 * Dispatches to the Kepler or Fermi readback helper.
 * NOTE(review): return-type line, declarations of value/ret/p/c and the
 * early-return on failed readback were lost in extraction. */
1994 nvc0_hw_sm_get_query_result(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
,
1995 boolean wait
, union pipe_query_result
*result
)
1997 uint32_t count
[32][8];
1999 unsigned mp_count
= MIN2(nvc0
->screen
->mp_count_compute
, 32);
2001 const struct nvc0_hw_sm_query_cfg
*cfg
;
2004 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
2006 if (nvc0
->screen
->base
.class_3d
>= NVE4_3D_CLASS
)
2007 ret
= nve4_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
2009 ret
= nvc0_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
2013 for (c
= 0; c
< cfg
->num_counters
; ++c
)
2014 for (p
= 0; p
< mp_count
; ++p
)
2015 value
+= count
[p
][c
];
2016 value
= (value
* cfg
->norm
[0]) / cfg
->norm
[1];
2018 *(uint64_t *)result
= value
;
/* Vtable wiring the SM-counter implementations into the generic hw-query
 * dispatch (see nvc0_hw_query_funcs users).
 * NOTE(review): initializer braces lost in extraction. */
2022 static const struct nvc0_hw_query_funcs hw_sm_query_funcs
= {
2023 .destroy_query
= nvc0_hw_sm_destroy_query
,
2024 .begin_query
= nvc0_hw_sm_begin_query
,
2025 .end_query
= nvc0_hw_sm_end_query
,
2026 .get_query_result
= nvc0_hw_sm_get_query_result
,
/* Create a SM performance query of the given type. Requires kernel DRM
 * >= 1.1.1 (compute/firmware methods); rejects types outside the
 * NVC0_HW_SM_QUERY range. Buffer layout differs per architecture:
 * Kepler needs (4*4 + 4 + 4) words per MP, Fermi (8 + 1 + 3) words per MP.
 * NOTE(review): several lines (NULL returns, hq assignment, the
 * allocation-failure FREE path and final return) were lost in extraction. */
2029 struct nvc0_hw_query
*
2030 nvc0_hw_sm_create_query(struct nvc0_context
*nvc0
, unsigned type
)
2032 struct nvc0_screen
*screen
= nvc0
->screen
;
2033 struct nvc0_hw_sm_query
*hsq
;
2034 struct nvc0_hw_query
*hq
;
2037 if (nvc0
->screen
->base
.drm
->version
< 0x01000101)
2040 if (type
< NVC0_HW_SM_QUERY(0) || type
> NVC0_HW_SM_QUERY_LAST
)
2043 hsq
= CALLOC_STRUCT(nvc0_hw_sm_query
);
2048 hq
->funcs
= &hw_sm_query_funcs
;
2049 hq
->base
.type
= type
;
2051 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
/* Kepler layout: 4 words per warp scheduler, plus sequence words. */
2073 * [50] = WS0.sequence
2074 * [54] = WS1.sequence
2075 * [58] = WS2.sequence
2076 * [5c] = WS3.sequence
2078 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
2081 * Note that padding is used to align memory access to 128 bits.
2092 * [20] = MP.sequence
2097 space
= (8 + 1 + 3) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
2100 if (!nvc0_hw_query_allocate(nvc0
, &hq
->base
, space
)) {
2109 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen
*screen
, unsigned id
,
2110 struct pipe_driver_query_info
*info
)
2114 if (screen
->base
.drm
->version
>= 0x01000101) {
2115 if (screen
->compute
)
2116 count
= nvc0_hw_sm_get_num_queries(screen
);
2123 if (screen
->compute
) {
2124 if (screen
->base
.class_3d
<= NVF0_3D_CLASS
) {
2125 const struct nvc0_hw_sm_query_cfg
**queries
=
2126 nvc0_hw_sm_get_queries(screen
);
2128 info
->name
= nvc0_hw_sm_query_get_name(queries
[id
]->type
);
2129 info
->query_type
= NVC0_HW_SM_QUERY(queries
[id
]->type
);
2130 info
->group_id
= NVC0_HW_SM_QUERY_GROUP
;