/*
 * Copyright 2011 Christoph Bumiller
 * Copyright 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
33 /* NOTE: intentionally using the same names as NV */
34 #define _Q(t, n) { NVC0_HW_SM_QUERY_##t, n }
38 } nvc0_hw_sm_queries
[] = {
39 _Q(ACTIVE_CYCLES
, "active_cycles" ),
40 _Q(ACTIVE_WARPS
, "active_warps" ),
41 _Q(ATOM_CAS_COUNT
, "atom_cas_count" ),
42 _Q(ATOM_COUNT
, "atom_count" ),
43 _Q(BRANCH
, "branch" ),
44 _Q(DIVERGENT_BRANCH
, "divergent_branch" ),
45 _Q(GLD_REQUEST
, "gld_request" ),
46 _Q(GLD_MEM_DIV_REPLAY
, "global_ld_mem_divergence_replays" ),
47 _Q(GST_TRANSACTIONS
, "global_store_transaction" ),
48 _Q(GST_MEM_DIV_REPLAY
, "global_st_mem_divergence_replays" ),
49 _Q(GRED_COUNT
, "gred_count" ),
50 _Q(GST_REQUEST
, "gst_request" ),
51 _Q(INST_EXECUTED
, "inst_executed" ),
52 _Q(INST_ISSUED
, "inst_issued" ),
53 _Q(INST_ISSUED1
, "inst_issued1" ),
54 _Q(INST_ISSUED2
, "inst_issued2" ),
55 _Q(INST_ISSUED1_0
, "inst_issued1_0" ),
56 _Q(INST_ISSUED1_1
, "inst_issued1_1" ),
57 _Q(INST_ISSUED2_0
, "inst_issued2_0" ),
58 _Q(INST_ISSUED2_1
, "inst_issued2_1" ),
59 _Q(L1_GLD_HIT
, "l1_global_load_hit" ),
60 _Q(L1_GLD_MISS
, "l1_global_load_miss" ),
61 _Q(L1_GLD_TRANSACTIONS
, "__l1_global_load_transactions" ),
62 _Q(L1_GST_TRANSACTIONS
, "__l1_global_store_transactions" ),
63 _Q(L1_LOCAL_LD_HIT
, "l1_local_load_hit" ),
64 _Q(L1_LOCAL_LD_MISS
, "l1_local_load_miss" ),
65 _Q(L1_LOCAL_ST_HIT
, "l1_local_store_hit" ),
66 _Q(L1_LOCAL_ST_MISS
, "l1_local_store_miss" ),
67 _Q(L1_SHARED_LD_TRANSACTIONS
, "l1_shared_load_transactions" ),
68 _Q(L1_SHARED_ST_TRANSACTIONS
, "l1_shared_store_transactions" ),
69 _Q(LOCAL_LD
, "local_load" ),
70 _Q(LOCAL_LD_TRANSACTIONS
, "local_load_transactions" ),
71 _Q(LOCAL_ST
, "local_store" ),
72 _Q(LOCAL_ST_TRANSACTIONS
, "local_store_transactions" ),
73 _Q(NOT_PRED_OFF_INST_EXECUTED
, "not_predicated_off_thread_inst_executed" ),
74 _Q(PROF_TRIGGER_0
, "prof_trigger_00" ),
75 _Q(PROF_TRIGGER_1
, "prof_trigger_01" ),
76 _Q(PROF_TRIGGER_2
, "prof_trigger_02" ),
77 _Q(PROF_TRIGGER_3
, "prof_trigger_03" ),
78 _Q(PROF_TRIGGER_4
, "prof_trigger_04" ),
79 _Q(PROF_TRIGGER_5
, "prof_trigger_05" ),
80 _Q(PROF_TRIGGER_6
, "prof_trigger_06" ),
81 _Q(PROF_TRIGGER_7
, "prof_trigger_07" ),
82 _Q(SHARED_LD
, "shared_load" ),
83 _Q(SHARED_LD_REPLAY
, "shared_load_replay" ),
84 _Q(SHARED_ST
, "shared_store" ),
85 _Q(SHARED_ST_REPLAY
, "shared_store_replay" ),
86 _Q(SM_CTA_LAUNCHED
, "sm_cta_launched" ),
87 _Q(THREADS_LAUNCHED
, "threads_launched" ),
88 _Q(TH_INST_EXECUTED
, "thread_inst_executed" ),
89 _Q(TH_INST_EXECUTED_0
, "thread_inst_executed_0" ),
90 _Q(TH_INST_EXECUTED_1
, "thread_inst_executed_1" ),
91 _Q(TH_INST_EXECUTED_2
, "thread_inst_executed_2" ),
92 _Q(TH_INST_EXECUTED_3
, "thread_inst_executed_3" ),
93 _Q(UNCACHED_GLD_TRANSACTIONS
, "uncached_global_load_transaction" ),
94 _Q(WARPS_LAUNCHED
, "warps_launched" ),
99 static inline const char *
100 nvc0_hw_sm_query_get_name(unsigned query_type
)
104 for (i
= 0; i
< ARRAY_SIZE(nvc0_hw_sm_queries
); i
++) {
105 if (nvc0_hw_sm_queries
[i
].type
== query_type
)
106 return nvc0_hw_sm_queries
[i
].name
;
112 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
114 /* Code to read out MP counters: They are accessible via mmio, too, but let's
115 * just avoid mapping registers in userspace. We'd have to know which MPs are
116 * enabled/present, too, and that information is not presently exposed.
117 * We could add a kernel interface for it, but reading the counters like this
118 * has the advantage of being async (if get_result isn't called immediately).
/* GK104 (SM30) machine code that reads out the MP performance counters.
 * The output buffer address comes from c0[0x0]/c0[0x4]; counter values are
 * stored from $r0..$r7 with b128/b32 stores at the end.
 * Partial disassembly (some annotation lines were lost in formatting):
 *   sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
 *   mov b32 $r12 $physid
 *   sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
 *   set $p0 0x1 eq u32 $r8 0x0
 *   mov b32 $r10 c0[0x0]
 *   ext u32 $r8 $r12 0x414
 *   mov b32 $r11 c0[0x4]
 *   sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
 *   ext u32 $r9 $r12 0x208
 *   set $p1 0x1 eq u32 $r9 0x0
 *   mul $r8 u32 $r8 u32 96
 *   mul $r12 u32 $r9 u32 16
 *   mul $r13 u32 $r9 u32 4
 *   add b32 $r9 $r8 $r13
 *   sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
 *   add b32 $r8 $r8 $r12
 *   add b32 $r10 $c $r10 $r8
 *   add b32 $r11 $r11 0x0 $c
 *   add b32 $r12 $c $r12 $r9
 *   st b128 wt g[$r10d] $r0q
 *   sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
 *   mov b32 $r0 c0[0x8]
 *   add b32 $r13 $r13 0x0 $c
 *   $p1 st b128 wt g[$r12d+0x40] $r4q
 *   st b32 wt g[$r12d+0x50] $r0
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
/* GK110 (SM35) machine code for reading out the MP performance counters.
 * Same kernel as GK104, re-encoded for the GK110 ISA. */
static const uint64_t nvf0_read_hw_sm_counters_code[] =
{
   0x0880808080808080ULL,
   0x86400000109c0022ULL,
   0x86400000019c0032ULL,
   0x86400000021c0002ULL,
   0x86400000029c0006ULL,
   0x86400000031c000aULL,
   0x86400000039c000eULL,
   0x86400000041c0012ULL,
   0x08ac1080108c8080ULL,
   0x86400000049c0016ULL,
   0x86400000051c001aULL,
   0x86400000059c001eULL,
   0xdb201c007f9c201eULL,
   0x64c03c00001c002aULL,
   0xc00000020a1c3021ULL,
   0x64c03c00009c002eULL,
   0x0810a0808010b810ULL,
   0xc0000001041c3025ULL,
   0x180000000020003cULL,
   0xdb201c007f9c243eULL,
   0xc1c00000301c2021ULL,
   0xc1c00000081c2431ULL,
   0xc1c00000021c2435ULL,
   0xe0800000069c2026ULL,
   0x08b010b010b010a0ULL,
   0xe0800000061c2022ULL,
   0xe4c03c00051c0032ULL,
   0xe0840000041c282aULL,
   0xe4c03c00059c0036ULL,
   0xe08040007f9c2c2eULL,
   0xe0840000049c3032ULL,
   0xfe800000001c2800ULL,
   0x080000b81080b010ULL,
   0x64c03c00011c0002ULL,
   0xe08040007f9c3436ULL,
   0xfe80000020043010ULL,
   0xfc800000281c3000ULL,
   0x18000000001c003cULL
};
243 /* For simplicity, we will allocate as many group slots as we allocate counter
244 * slots. This means that a single counter which wants to source from 2 groups
245 * will have to be declared as using 2 counter slots. This shouldn't really be
246 * a problem because such queries don't make much sense ... (unless someone is
/* Configuration of one hardware performance-counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
259 struct nvc0_hw_sm_query_cfg
262 struct nvc0_hw_sm_counter_cfg ctr
[8];
263 uint8_t num_counters
;
264 uint8_t norm
[2]; /* normalization num,denom */
/* Counter-config helpers for NVE4+: _CA selects signal domain A (per
 * warp scheduler), _CB selects domain B; f = func mask, m = mode suffix,
 * g = signal-group suffix, s = src_sel.  src_mask is always 0 here. */
#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }
#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }
/* _Q places a query config at its NVE4_HW_SM_QUERY_* slot in a table. */
#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
271 /* ==== Compute capability 3.0 (GK104:GK110) ==== */
272 static const struct nvc0_hw_sm_query_cfg
275 .type
= NVC0_HW_SM_QUERY_ACTIVE_CYCLES
,
276 .ctr
[0] = _CB(0x0001, B6
, WARP
, 0x00000000),
281 static const struct nvc0_hw_sm_query_cfg
284 .type
= NVC0_HW_SM_QUERY_ACTIVE_WARPS
,
285 .ctr
[0] = _CB(0x003f, B6
, WARP
, 0x31483104),
290 static const struct nvc0_hw_sm_query_cfg
291 sm30_atom_cas_count
=
293 .type
= NVC0_HW_SM_QUERY_ATOM_CAS_COUNT
,
294 .ctr
[0] = _CA(0x0001, B6
, BRANCH
, 0x000000004),
299 static const struct nvc0_hw_sm_query_cfg
302 .type
= NVC0_HW_SM_QUERY_ATOM_COUNT
,
303 .ctr
[0] = _CA(0x0001, B6
, BRANCH
, 0x00000000),
308 static const struct nvc0_hw_sm_query_cfg
311 .type
= NVC0_HW_SM_QUERY_BRANCH
,
312 .ctr
[0] = _CA(0x0001, B6
, BRANCH
, 0x0000000c),
317 static const struct nvc0_hw_sm_query_cfg
318 sm30_divergent_branch
=
320 .type
= NVC0_HW_SM_QUERY_DIVERGENT_BRANCH
,
321 .ctr
[0] = _CA(0x0001, B6
, BRANCH
, 0x00000010),
326 static const struct nvc0_hw_sm_query_cfg
329 .type
= NVC0_HW_SM_QUERY_GLD_REQUEST
,
330 .ctr
[0] = _CA(0x0001, B6
, LDST
, 0x00000010),
335 static const struct nvc0_hw_sm_query_cfg
336 sm30_gld_mem_div_replay
=
338 .type
= NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY
,
339 .ctr
[0] = _CB(0x0001, B6
, REPLAY
, 0x00000010),
344 static const struct nvc0_hw_sm_query_cfg
345 sm30_gst_transactions
=
347 .type
= NVC0_HW_SM_QUERY_GST_TRANSACTIONS
,
348 .ctr
[0] = _CB(0x0001, B6
, MEM
, 0x00000004),
353 static const struct nvc0_hw_sm_query_cfg
354 sm30_gst_mem_div_replay
=
356 .type
= NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY
,
357 .ctr
[0] = _CB(0x0001, B6
, REPLAY
, 0x00000014),
362 static const struct nvc0_hw_sm_query_cfg
365 .type
= NVC0_HW_SM_QUERY_GRED_COUNT
,
366 .ctr
[0] = _CA(0x0001, B6
, BRANCH
, 0x00000008),
371 static const struct nvc0_hw_sm_query_cfg
374 .type
= NVC0_HW_SM_QUERY_GST_REQUEST
,
375 .ctr
[0] = _CA(0x0001, B6
, LDST
, 0x00000014),
380 static const struct nvc0_hw_sm_query_cfg
383 .type
= NVC0_HW_SM_QUERY_INST_EXECUTED
,
384 .ctr
[0] = _CA(0x0003, B6
, EXEC
, 0x00000398),
389 static const struct nvc0_hw_sm_query_cfg
392 .type
= NVC0_HW_SM_QUERY_INST_ISSUED1
,
393 .ctr
[0] = _CA(0x0001, B6
, ISSUE
, 0x00000004),
398 static const struct nvc0_hw_sm_query_cfg
401 .type
= NVC0_HW_SM_QUERY_INST_ISSUED2
,
402 .ctr
[0] = _CA(0x0001, B6
, ISSUE
, 0x00000008),
407 static const struct nvc0_hw_sm_query_cfg
410 .type
= NVC0_HW_SM_QUERY_L1_GLD_HIT
,
411 .ctr
[0] = _CB(0x0001, B6
, L1
, 0x00000010),
416 static const struct nvc0_hw_sm_query_cfg
419 .type
= NVC0_HW_SM_QUERY_L1_GLD_MISS
,
420 .ctr
[0] = _CB(0x0001, B6
, L1
, 0x00000014),
425 static const struct nvc0_hw_sm_query_cfg
426 sm30_l1_gld_transactions
=
428 .type
= NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS
,
429 .ctr
[0] = _CB(0x0001, B6
, UNK0F
, 0x00000000),
434 static const struct nvc0_hw_sm_query_cfg
435 sm30_l1_gst_transactions
=
437 .type
= NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS
,
438 .ctr
[0] = _CB(0x0001, B6
, UNK0F
, 0x00000004),
443 static const struct nvc0_hw_sm_query_cfg
444 sm30_l1_local_ld_hit
=
446 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT
,
447 .ctr
[0] = _CB(0x0001, B6
, L1
, 0x00000000),
452 static const struct nvc0_hw_sm_query_cfg
453 sm30_l1_local_ld_miss
=
455 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS
,
456 .ctr
[0] = _CB(0x0001, B6
, L1
, 0x00000004),
461 static const struct nvc0_hw_sm_query_cfg
462 sm30_l1_local_st_hit
=
464 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT
,
465 .ctr
[0] = _CB(0x0001, B6
, L1
, 0x00000008),
470 static const struct nvc0_hw_sm_query_cfg
471 sm30_l1_local_st_miss
=
473 .type
= NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS
,
474 .ctr
[0] = _CB(0x0001, B6
, L1
, 0x0000000c),
479 static const struct nvc0_hw_sm_query_cfg
480 sm30_l1_shared_ld_transactions
=
482 .type
= NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS
,
483 .ctr
[0] = _CB(0x0001, B6
, TRANSACTION
, 0x00000008),
488 static const struct nvc0_hw_sm_query_cfg
489 sm30_l1_shared_st_transactions
=
491 .type
= NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS
,
492 .ctr
[0] = _CB(0x0001, B6
, TRANSACTION
, 0x0000000c),
497 static const struct nvc0_hw_sm_query_cfg
500 .type
= NVC0_HW_SM_QUERY_LOCAL_LD
,
501 .ctr
[0] = _CA(0x0001, B6
, LDST
, 0x00000008),
506 static const struct nvc0_hw_sm_query_cfg
507 sm30_local_ld_transactions
=
509 .type
= NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS
,
510 .ctr
[0] = _CB(0x0001, B6
, TRANSACTION
, 0x00000000),
515 static const struct nvc0_hw_sm_query_cfg
518 .type
= NVC0_HW_SM_QUERY_LOCAL_ST
,
519 .ctr
[0] = _CA(0x0001, B6
, LDST
, 0x0000000c),
524 static const struct nvc0_hw_sm_query_cfg
525 sm30_local_st_transactions
=
527 .type
= NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS
,
528 .ctr
[0] = _CB(0x0001, B6
, TRANSACTION
, 0x00000004),
533 static const struct nvc0_hw_sm_query_cfg
534 sm30_prof_trigger_0
=
536 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_0
,
537 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x00000000),
542 static const struct nvc0_hw_sm_query_cfg
543 sm30_prof_trigger_1
=
545 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_1
,
546 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x00000004),
551 static const struct nvc0_hw_sm_query_cfg
552 sm30_prof_trigger_2
=
554 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_2
,
555 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x00000008),
560 static const struct nvc0_hw_sm_query_cfg
561 sm30_prof_trigger_3
=
563 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_3
,
564 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x0000000c),
569 static const struct nvc0_hw_sm_query_cfg
570 sm30_prof_trigger_4
=
572 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_4
,
573 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x00000010),
578 static const struct nvc0_hw_sm_query_cfg
579 sm30_prof_trigger_5
=
581 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_5
,
582 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x00000014),
587 static const struct nvc0_hw_sm_query_cfg
588 sm30_prof_trigger_6
=
590 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_6
,
591 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x00000018),
596 static const struct nvc0_hw_sm_query_cfg
597 sm30_prof_trigger_7
=
599 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_7
,
600 .ctr
[0] = _CA(0x0001, B6
, USER
, 0x0000001c),
605 static const struct nvc0_hw_sm_query_cfg
608 .type
= NVC0_HW_SM_QUERY_SHARED_LD
,
609 .ctr
[0] = _CA(0x0001, B6
, LDST
, 0x00000000),
614 static const struct nvc0_hw_sm_query_cfg
615 sm30_shared_ld_replay
=
617 .type
= NVC0_HW_SM_QUERY_SHARED_LD_REPLAY
,
618 .ctr
[0] = _CB(0x0001, B6
, REPLAY
, 0x00000008),
623 static const struct nvc0_hw_sm_query_cfg
626 .type
= NVC0_HW_SM_QUERY_SHARED_ST
,
627 .ctr
[0] = _CA(0x0001, B6
, LDST
, 0x00000004),
632 static const struct nvc0_hw_sm_query_cfg
633 sm30_shared_st_replay
=
635 .type
= NVC0_HW_SM_QUERY_SHARED_ST_REPLAY
,
636 .ctr
[0] = _CB(0x0001, B6
, REPLAY
, 0x0000000c),
641 static const struct nvc0_hw_sm_query_cfg
642 sm30_sm_cta_launched
=
644 .type
= NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED
,
645 .ctr
[0] = _CB(0x0001, B6
, WARP
, 0x0000001c),
650 static const struct nvc0_hw_sm_query_cfg
651 sm30_threads_launched
=
653 .type
= NVC0_HW_SM_QUERY_THREADS_LAUNCHED
,
654 .ctr
[0] = _CA(0x003f, B6
, LAUNCH
, 0x398a4188),
659 static const struct nvc0_hw_sm_query_cfg
660 sm30_uncached_gld_transactions
=
662 .type
= NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS
,
663 .ctr
[0] = _CB(0x0001, B6
, MEM
, 0x00000000),
668 static const struct nvc0_hw_sm_query_cfg
669 sm30_warps_launched
=
671 .type
= NVC0_HW_SM_QUERY_WARPS_LAUNCHED
,
672 .ctr
[0] = _CA(0x0001, B6
, LAUNCH
, 0x00000004),
678 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
679 * inst_executed etc.: we only count a single warp scheduler
681 static const struct nvc0_hw_sm_query_cfg
*sm30_hw_sm_queries
[] =
685 &sm30_atom_cas_count
,
688 &sm30_divergent_branch
,
690 &sm30_gld_mem_div_replay
,
691 &sm30_gst_transactions
,
692 &sm30_gst_mem_div_replay
,
700 &sm30_l1_gld_transactions
,
701 &sm30_l1_gst_transactions
,
702 &sm30_l1_local_ld_hit
,
703 &sm30_l1_local_ld_miss
,
704 &sm30_l1_local_st_hit
,
705 &sm30_l1_local_st_miss
,
706 &sm30_l1_shared_ld_transactions
,
707 &sm30_l1_shared_st_transactions
,
709 &sm30_local_ld_transactions
,
711 &sm30_local_st_transactions
,
712 &sm30_prof_trigger_0
,
713 &sm30_prof_trigger_1
,
714 &sm30_prof_trigger_2
,
715 &sm30_prof_trigger_3
,
716 &sm30_prof_trigger_4
,
717 &sm30_prof_trigger_5
,
718 &sm30_prof_trigger_6
,
719 &sm30_prof_trigger_7
,
721 &sm30_shared_ld_replay
,
723 &sm30_shared_st_replay
,
724 &sm30_sm_cta_launched
,
725 &sm30_threads_launched
,
726 &sm30_uncached_gld_transactions
,
727 &sm30_warps_launched
,
730 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
731 static const struct nvc0_hw_sm_query_cfg
732 sm35_atom_cas_count
=
734 .type
= NVC0_HW_SM_QUERY_ATOM_CAS_COUNT
,
735 .ctr
[0] = _CA(0x0001, B6
, UNK1A
, 0x00000014),
740 static const struct nvc0_hw_sm_query_cfg
743 .type
= NVC0_HW_SM_QUERY_ATOM_COUNT
,
744 .ctr
[0] = _CA(0x0001, B6
, UNK1A
, 0x00000010),
749 static const struct nvc0_hw_sm_query_cfg
752 .type
= NVC0_HW_SM_QUERY_GRED_COUNT
,
753 .ctr
[0] = _CA(0x0001, B6
, UNK1A
, 0x00000018),
758 static const struct nvc0_hw_sm_query_cfg
759 sm35_not_pred_off_inst_executed
=
761 .type
= NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED
,
762 .ctr
[0] = _CA(0x003f, B6
, UNK14
, 0x29062080),
767 static const struct nvc0_hw_sm_query_cfg
768 sm35_shared_ld_replay
=
770 .type
= NVC0_HW_SM_QUERY_SHARED_LD_REPLAY
,
771 .ctr
[0] = _CB(0xaaaa, LOGOP
, UNK13
, 0x00000018),
772 .ctr
[1] = _CB(0x8888, LOGOP
, REPLAY
, 0x00000151),
777 static const struct nvc0_hw_sm_query_cfg
778 sm35_shared_st_replay
=
780 .type
= NVC0_HW_SM_QUERY_SHARED_ST_REPLAY
,
781 .ctr
[0] = _CB(0xaaaa, LOGOP
, UNK13
, 0x00000018),
782 .ctr
[1] = _CB(0x8888, LOGOP
, REPLAY
, 0x000001d1),
787 static const struct nvc0_hw_sm_query_cfg
788 sm35_th_inst_executed
=
790 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED
,
791 .ctr
[0] = _CA(0x003f, B6
, UNK11
, 0x29062080),
796 static const struct nvc0_hw_sm_query_cfg
*sm35_hw_sm_queries
[] =
800 &sm35_atom_cas_count
,
803 &sm30_gld_mem_div_replay
,
804 &sm30_gst_transactions
,
805 &sm30_gst_mem_div_replay
,
813 &sm30_l1_gld_transactions
,
814 &sm30_l1_gst_transactions
,
815 &sm30_l1_local_ld_hit
,
816 &sm30_l1_local_ld_miss
,
817 &sm30_l1_local_st_hit
,
818 &sm30_l1_local_st_miss
,
819 &sm30_l1_shared_ld_transactions
,
820 &sm30_l1_shared_st_transactions
,
822 &sm30_local_ld_transactions
,
824 &sm30_local_st_transactions
,
825 &sm35_not_pred_off_inst_executed
,
826 &sm30_prof_trigger_0
,
827 &sm30_prof_trigger_1
,
828 &sm30_prof_trigger_2
,
829 &sm30_prof_trigger_3
,
830 &sm30_prof_trigger_4
,
831 &sm30_prof_trigger_5
,
832 &sm30_prof_trigger_6
,
833 &sm30_prof_trigger_7
,
835 &sm35_shared_ld_replay
,
837 &sm35_shared_st_replay
,
838 &sm30_sm_cta_launched
,
839 &sm35_th_inst_executed
,
840 &sm30_threads_launched
,
841 &sm30_uncached_gld_transactions
,
842 &sm30_warps_launched
,
849 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
851 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
852 * because there is a context-switch problem that we need to fix.
853 * Results might be wrong sometimes, be careful!
/* GF100 (SM20/SM21) machine code that reads out the MP performance
 * counters.  Output address comes from c0[0x0]/c0[0x4]; two b128 stores
 * plus one b32 store write out the values.
 * Partial disassembly (some annotation lines were lost in formatting):
 *   mov b32 $r9 $physid
 *   set $p0 0x1 eq u32 $r8 0x0
 *   mov b32 $r10 c0[0x0]
 *   mov b32 $r11 c0[0x4]
 *   ext u32 $r8 $r9 0x414
 *   mul $r8 u32 $r8 u32 48
 *   add b32 $r10 $c $r10 $r8
 *   add b32 $r11 $r11 0x0 $c
 *   mov b32 $r8 c0[0x8]
 *   st b128 wt g[$r10d+0x00] $r0q
 *   st b128 wt g[$r10d+0x10] $r4q
 *   st b32 wt g[$r10d+0x20] $r8
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
905 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
907 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
908 static const struct nvc0_hw_sm_query_cfg
911 .type
= NVC0_HW_SM_QUERY_ACTIVE_CYCLES
,
912 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x11, 0x000000ff, 0x00000000),
917 static const struct nvc0_hw_sm_query_cfg
920 .type
= NVC0_HW_SM_QUERY_ACTIVE_WARPS
,
921 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000010),
922 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000020),
923 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000030),
924 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000040),
925 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000050),
926 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000060),
931 static const struct nvc0_hw_sm_query_cfg
934 .type
= NVC0_HW_SM_QUERY_ATOM_COUNT
,
935 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000030),
940 static const struct nvc0_hw_sm_query_cfg
943 .type
= NVC0_HW_SM_QUERY_BRANCH
,
944 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000000),
945 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000010),
950 static const struct nvc0_hw_sm_query_cfg
951 sm20_divergent_branch
=
953 .type
= NVC0_HW_SM_QUERY_DIVERGENT_BRANCH
,
954 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000020),
955 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000030),
960 static const struct nvc0_hw_sm_query_cfg
963 .type
= NVC0_HW_SM_QUERY_GLD_REQUEST
,
964 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000030),
969 static const struct nvc0_hw_sm_query_cfg
972 .type
= NVC0_HW_SM_QUERY_GRED_COUNT
,
973 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000040),
978 static const struct nvc0_hw_sm_query_cfg
981 .type
= NVC0_HW_SM_QUERY_GST_REQUEST
,
982 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000060),
987 static const struct nvc0_hw_sm_query_cfg
990 .type
= NVC0_HW_SM_QUERY_INST_EXECUTED
,
991 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001000),
992 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001010),
997 static const struct nvc0_hw_sm_query_cfg
1000 .type
= NVC0_HW_SM_QUERY_INST_ISSUED
,
1001 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007060),
1002 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007070),
1007 static const struct nvc0_hw_sm_query_cfg
1010 .type
= NVC0_HW_SM_QUERY_LOCAL_LD
,
1011 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000020),
1016 static const struct nvc0_hw_sm_query_cfg
1019 .type
= NVC0_HW_SM_QUERY_LOCAL_ST
,
1020 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000050),
1025 static const struct nvc0_hw_sm_query_cfg
1026 sm20_prof_trigger_0
=
1028 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_0
,
1029 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000000),
1034 static const struct nvc0_hw_sm_query_cfg
1035 sm20_prof_trigger_1
=
1037 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_1
,
1038 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000010),
1043 static const struct nvc0_hw_sm_query_cfg
1044 sm20_prof_trigger_2
=
1046 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_2
,
1047 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000020),
1052 static const struct nvc0_hw_sm_query_cfg
1053 sm20_prof_trigger_3
=
1055 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_3
,
1056 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000030),
1061 static const struct nvc0_hw_sm_query_cfg
1062 sm20_prof_trigger_4
=
1064 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_4
,
1065 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000040),
1070 static const struct nvc0_hw_sm_query_cfg
1071 sm20_prof_trigger_5
=
1073 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_5
,
1074 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000050),
1079 static const struct nvc0_hw_sm_query_cfg
1080 sm20_prof_trigger_6
=
1082 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_6
,
1083 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000060),
1088 static const struct nvc0_hw_sm_query_cfg
1089 sm20_prof_trigger_7
=
1091 .type
= NVC0_HW_SM_QUERY_PROF_TRIGGER_7
,
1092 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000070),
1097 static const struct nvc0_hw_sm_query_cfg
1100 .type
= NVC0_HW_SM_QUERY_SHARED_LD
,
1101 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000010),
1106 static const struct nvc0_hw_sm_query_cfg
1109 .type
= NVC0_HW_SM_QUERY_SHARED_ST
,
1110 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000040),
1115 static const struct nvc0_hw_sm_query_cfg
1116 sm20_threads_launched
=
1118 .type
= NVC0_HW_SM_QUERY_THREADS_LAUNCHED
,
1119 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000010),
1120 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000020),
1121 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000030),
1122 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000040),
1123 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000050),
1124 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000060),
1129 static const struct nvc0_hw_sm_query_cfg
1130 sm20_th_inst_executed_0
=
1132 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0
,
1133 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000000),
1134 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000010),
1135 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000020),
1136 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000030),
1137 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000040),
1138 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000050),
1143 static const struct nvc0_hw_sm_query_cfg
1144 sm20_th_inst_executed_1
=
1146 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1
,
1147 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000000),
1148 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000010),
1149 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000020),
1150 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000030),
1151 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000040),
1152 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000050),
1157 static const struct nvc0_hw_sm_query_cfg
1158 sm20_warps_launched
=
1160 .type
= NVC0_HW_SM_QUERY_WARPS_LAUNCHED
,
1161 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000000),
1166 static const struct nvc0_hw_sm_query_cfg
*sm20_hw_sm_queries
[] =
1168 &sm20_active_cycles
,
1172 &sm20_divergent_branch
,
1176 &sm20_inst_executed
,
1180 &sm20_prof_trigger_0
,
1181 &sm20_prof_trigger_1
,
1182 &sm20_prof_trigger_2
,
1183 &sm20_prof_trigger_3
,
1184 &sm20_prof_trigger_4
,
1185 &sm20_prof_trigger_5
,
1186 &sm20_prof_trigger_6
,
1187 &sm20_prof_trigger_7
,
1190 &sm20_threads_launched
,
1191 &sm20_th_inst_executed_0
,
1192 &sm20_th_inst_executed_1
,
1193 &sm20_warps_launched
,
1196 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
1197 static const struct nvc0_hw_sm_query_cfg
1198 sm21_inst_executed
=
1200 .type
= NVC0_HW_SM_QUERY_INST_EXECUTED
,
1201 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000000),
1202 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000010),
1203 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000020),
1208 static const struct nvc0_hw_sm_query_cfg
1209 sm21_inst_issued1_0
=
1211 .type
= NVC0_HW_SM_QUERY_INST_ISSUED1_0
,
1212 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000010),
1217 static const struct nvc0_hw_sm_query_cfg
1218 sm21_inst_issued1_1
=
1220 .type
= NVC0_HW_SM_QUERY_INST_ISSUED1_1
,
1221 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000040),
1226 static const struct nvc0_hw_sm_query_cfg
1227 sm21_inst_issued2_0
=
1229 .type
= NVC0_HW_SM_QUERY_INST_ISSUED2_0
,
1230 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000020),
1235 static const struct nvc0_hw_sm_query_cfg
1236 sm21_inst_issued2_1
=
1238 .type
= NVC0_HW_SM_QUERY_INST_ISSUED2_1
,
1239 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000050),
1244 static const struct nvc0_hw_sm_query_cfg
1245 sm21_th_inst_executed_0
=
1247 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0
,
1248 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000000),
1249 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000010),
1250 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000020),
1251 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000030),
1252 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000040),
1253 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000050),
1258 static const struct nvc0_hw_sm_query_cfg
1259 sm21_th_inst_executed_1
=
1261 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1
,
1262 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000000),
1263 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000010),
1264 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000020),
1265 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000030),
1266 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000040),
1267 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000050),
1272 static const struct nvc0_hw_sm_query_cfg
1273 sm21_th_inst_executed_2
=
1275 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2
,
1276 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000000),
1277 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000010),
1278 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000020),
1279 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000030),
1280 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000040),
1281 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000050),
1286 static const struct nvc0_hw_sm_query_cfg
1287 sm21_th_inst_executed_3
=
1289 .type
= NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3
,
1290 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000000),
1291 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000010),
1292 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000020),
1293 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000030),
1294 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000040),
1295 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000050),
1300 static const struct nvc0_hw_sm_query_cfg
*sm21_hw_sm_queries
[] =
1302 &sm20_active_cycles
,
1306 &sm20_divergent_branch
,
1310 &sm21_inst_executed
,
1311 &sm21_inst_issued1_0
,
1312 &sm21_inst_issued1_1
,
1313 &sm21_inst_issued2_0
,
1314 &sm21_inst_issued2_1
,
1317 &sm20_prof_trigger_0
,
1318 &sm20_prof_trigger_1
,
1319 &sm20_prof_trigger_2
,
1320 &sm20_prof_trigger_3
,
1321 &sm20_prof_trigger_4
,
1322 &sm20_prof_trigger_5
,
1323 &sm20_prof_trigger_6
,
1324 &sm20_prof_trigger_7
,
1327 &sm20_threads_launched
,
1328 &sm21_th_inst_executed_0
,
1329 &sm21_th_inst_executed_1
,
1330 &sm21_th_inst_executed_2
,
1331 &sm21_th_inst_executed_3
,
1332 &sm20_warps_launched
,
1337 static inline const struct nvc0_hw_sm_query_cfg
**
1338 nvc0_hw_sm_get_queries(struct nvc0_screen
*screen
)
1340 struct nouveau_device
*dev
= screen
->base
.device
;
1342 switch (screen
->base
.class_3d
) {
1344 return sm35_hw_sm_queries
;
1346 return sm30_hw_sm_queries
;
1348 if (dev
->chipset
== 0xc0 || dev
->chipset
== 0xc8)
1349 return sm20_hw_sm_queries
;
1350 return sm21_hw_sm_queries
;
1357 nvc0_hw_sm_get_num_queries(struct nvc0_screen
*screen
)
1359 struct nouveau_device
*dev
= screen
->base
.device
;
1361 switch (screen
->base
.class_3d
) {
1363 return ARRAY_SIZE(sm35_hw_sm_queries
);
1365 return ARRAY_SIZE(sm30_hw_sm_queries
);
1367 if (dev
->chipset
== 0xc0 || dev
->chipset
== 0xc8)
1368 return ARRAY_SIZE(sm20_hw_sm_queries
);
1369 return ARRAY_SIZE(sm21_hw_sm_queries
);
1374 static const struct nvc0_hw_sm_query_cfg
*
1375 nvc0_hw_sm_query_get_cfg(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1377 const struct nvc0_hw_sm_query_cfg
**queries
;
1378 struct nvc0_screen
*screen
= nvc0
->screen
;
1379 struct nvc0_query
*q
= &hq
->base
;
1380 unsigned num_queries
;
1383 num_queries
= nvc0_hw_sm_get_num_queries(screen
);
1384 queries
= nvc0_hw_sm_get_queries(screen
);
1386 for (i
= 0; i
< num_queries
; i
++) {
1387 if (NVC0_HW_SM_QUERY(queries
[i
]->type
) == q
->type
)
1395 nvc0_hw_sm_destroy_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1397 struct nvc0_query
*q
= &hq
->base
;
1398 nvc0_hw_query_allocate(nvc0
, q
, 0);
1399 nouveau_fence_ref(NULL
, &hq
->fence
);
1404 nve4_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1406 struct nvc0_screen
*screen
= nvc0
->screen
;
1407 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1408 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1409 const struct nvc0_hw_sm_query_cfg
*cfg
;
1411 unsigned num_ab
[2] = { 0, 0 };
1413 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1415 /* check if we have enough free counter slots */
1416 for (i
= 0; i
< cfg
->num_counters
; ++i
)
1417 num_ab
[cfg
->ctr
[i
].sig_dom
]++;
1419 if (screen
->pm
.num_hw_sm_active
[0] + num_ab
[0] > 4 ||
1420 screen
->pm
.num_hw_sm_active
[1] + num_ab
[1] > 4) {
1421 NOUVEAU_ERR("Not enough free MP counter slots !\n");
1425 assert(cfg
->num_counters
<= 4);
1426 PUSH_SPACE(push
, 4 * 8 * + 6);
1428 if (!screen
->pm
.mp_counters_enabled
) {
1429 screen
->pm
.mp_counters_enabled
= true;
1430 BEGIN_NVC0(push
, SUBC_SW(0x06ac), 1);
1431 PUSH_DATA (push
, 0x1fcb);
1434 /* set sequence field to 0 (used to check if result is available) */
1435 for (i
= 0; i
< screen
->mp_count
; ++i
)
1436 hq
->data
[i
* 10 + 10] = 0;
1439 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1440 const unsigned d
= cfg
->ctr
[i
].sig_dom
;
1442 if (!screen
->pm
.num_hw_sm_active
[d
]) {
1443 uint32_t m
= (1 << 22) | (1 << (7 + (8 * !d
)));
1444 if (screen
->pm
.num_hw_sm_active
[!d
])
1445 m
|= 1 << (7 + (8 * d
));
1446 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
1447 PUSH_DATA (push
, m
);
1449 screen
->pm
.num_hw_sm_active
[d
]++;
1451 for (c
= d
* 4; c
< (d
* 4 + 4); ++c
) {
1452 if (!screen
->pm
.mp_counter
[c
]) {
1454 screen
->pm
.mp_counter
[c
] = hsq
;
1458 assert(c
<= (d
* 4 + 3)); /* must succeed, already checked for space */
1460 /* configure and reset the counter(s) */
1462 BEGIN_NVC0(push
, NVE4_CP(MP_PM_A_SIGSEL(c
& 3)), 1);
1464 BEGIN_NVC0(push
, NVE4_CP(MP_PM_B_SIGSEL(c
& 3)), 1);
1465 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
1466 BEGIN_NVC0(push
, NVE4_CP(MP_PM_SRCSEL(c
)), 1);
1467 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
+ 0x2108421 * (c
& 3));
1468 BEGIN_NVC0(push
, NVE4_CP(MP_PM_FUNC(c
)), 1);
1469 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1470 BEGIN_NVC0(push
, NVE4_CP(MP_PM_SET(c
)), 1);
1471 PUSH_DATA (push
, 0);
1477 nvc0_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1479 struct nvc0_screen
*screen
= nvc0
->screen
;
1480 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1481 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1482 const struct nvc0_hw_sm_query_cfg
*cfg
;
1485 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1486 return nve4_hw_sm_begin_query(nvc0
, hq
);
1488 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1490 /* check if we have enough free counter slots */
1491 if (screen
->pm
.num_hw_sm_active
[0] + cfg
->num_counters
> 8) {
1492 NOUVEAU_ERR("Not enough free MP counter slots !\n");
1496 assert(cfg
->num_counters
<= 8);
1497 PUSH_SPACE(push
, 8 * 8 + 2);
1499 /* set sequence field to 0 (used to check if result is available) */
1500 for (i
= 0; i
< screen
->mp_count
; ++i
) {
1501 const unsigned b
= (0x30 / 4) * i
;
1502 hq
->data
[b
+ 8] = 0;
1506 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1507 uint32_t mask_sel
= 0x00000000;
1509 if (!screen
->pm
.num_hw_sm_active
[0]) {
1510 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
1511 PUSH_DATA (push
, 0x80000000);
1513 screen
->pm
.num_hw_sm_active
[0]++;
1515 for (c
= 0; c
< 8; ++c
) {
1516 if (!screen
->pm
.mp_counter
[c
]) {
1518 screen
->pm
.mp_counter
[c
] = hsq
;
1523 /* Oddly-enough, the signal id depends on the slot selected on Fermi but
1524 * not on Kepler. Fortunately, the signal ids are just offseted by the
1527 mask_sel
|= (c
<< 8);
1528 mask_sel
|= (c
<< 16);
1529 mask_sel
|= (c
<< 24);
1530 mask_sel
&= cfg
->ctr
[i
].src_mask
;
1532 /* configure and reset the counter(s) */
1533 BEGIN_NVC0(push
, NVC0_CP(MP_PM_SIGSEL(c
)), 1);
1534 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
1535 BEGIN_NVC0(push
, NVC0_CP(MP_PM_SRCSEL(c
)), 1);
1536 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
| mask_sel
);
1537 BEGIN_NVC0(push
, NVC0_CP(MP_PM_OP(c
)), 1);
1538 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1539 BEGIN_NVC0(push
, NVC0_CP(MP_PM_SET(c
)), 1);
1540 PUSH_DATA (push
, 0);
1545 static inline struct nvc0_program
*
1546 nvc0_hw_sm_get_program(struct nvc0_screen
*screen
)
1548 struct nvc0_program
*prog
;
1550 prog
= CALLOC_STRUCT(nvc0_program
);
1554 prog
->type
= PIPE_SHADER_COMPUTE
;
1555 prog
->translated
= true;
1556 prog
->parm_size
= 12;
1558 if (screen
->base
.class_3d
== NVE4_3D_CLASS
||
1559 screen
->base
.class_3d
== NVF0_3D_CLASS
) {
1560 if (screen
->base
.class_3d
== NVE4_3D_CLASS
) {
1561 prog
->code
= (uint32_t *)nve4_read_hw_sm_counters_code
;
1562 prog
->code_size
= sizeof(nve4_read_hw_sm_counters_code
);
1564 prog
->code
= (uint32_t *)nvf0_read_hw_sm_counters_code
;
1565 prog
->code_size
= sizeof(nvf0_read_hw_sm_counters_code
);
1567 prog
->num_gprs
= 14;
1569 prog
->code
= (uint32_t *)nvc0_read_hw_sm_counters_code
;
1570 prog
->code_size
= sizeof(nvc0_read_hw_sm_counters_code
);
1571 prog
->num_gprs
= 12;
1577 nvc0_hw_sm_end_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1579 struct nvc0_screen
*screen
= nvc0
->screen
;
1580 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
1581 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1582 const bool is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
1583 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1584 struct pipe_grid_info info
= {};
1587 const uint block
[3] = { 32, is_nve4
? 4 : 1, 1 };
1588 const uint grid
[3] = { screen
->mp_count
, screen
->gpc_count
, 1 };
1591 if (unlikely(!screen
->pm
.prog
))
1592 screen
->pm
.prog
= nvc0_hw_sm_get_program(screen
);
1594 /* disable all counting */
1595 PUSH_SPACE(push
, 8);
1596 for (c
= 0; c
< 8; ++c
)
1597 if (screen
->pm
.mp_counter
[c
]) {
1599 IMMED_NVC0(push
, NVE4_CP(MP_PM_FUNC(c
)), 0);
1601 IMMED_NVC0(push
, NVC0_CP(MP_PM_OP(c
)), 0);
1604 /* release counters for this query */
1605 for (c
= 0; c
< 8; ++c
) {
1606 if (screen
->pm
.mp_counter
[c
] == hsq
) {
1607 uint8_t d
= is_nve4
? c
/ 4 : 0; /* only one domain for NVC0:NVE4 */
1608 screen
->pm
.num_hw_sm_active
[d
]--;
1609 screen
->pm
.mp_counter
[c
] = NULL
;
1613 BCTX_REFN_bo(nvc0
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
1616 PUSH_SPACE(push
, 1);
1617 IMMED_NVC0(push
, SUBC_CP(NV50_GRAPH_SERIALIZE
), 0);
1619 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
1620 input
[0] = (hq
->bo
->offset
+ hq
->base_offset
);
1621 input
[1] = (hq
->bo
->offset
+ hq
->base_offset
) >> 32;
1622 input
[2] = hq
->sequence
;
1624 for (i
= 0; i
< 3; i
++) {
1625 info
.block
[i
] = block
[i
];
1626 info
.grid
[i
] = grid
[i
];
1630 pipe
->launch_grid(pipe
, &info
);
1632 nouveau_bufctx_reset(nvc0
->bufctx_cp
, NVC0_BIND_CP_QUERY
);
1634 /* re-activate other counters */
1635 PUSH_SPACE(push
, 16);
1637 for (c
= 0; c
< 8; ++c
) {
1638 const struct nvc0_hw_sm_query_cfg
*cfg
;
1641 hsq
= screen
->pm
.mp_counter
[c
];
1645 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, &hsq
->base
);
1646 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1647 if (mask
& (1 << hsq
->ctr
[i
]))
1649 mask
|= 1 << hsq
->ctr
[i
];
1651 BEGIN_NVC0(push
, NVE4_CP(MP_PM_FUNC(hsq
->ctr
[i
])), 1);
1653 BEGIN_NVC0(push
, NVC0_CP(MP_PM_OP(hsq
->ctr
[i
])), 1);
1655 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1661 nvc0_hw_sm_query_read_data(uint32_t count
[32][8],
1662 struct nvc0_context
*nvc0
, bool wait
,
1663 struct nvc0_hw_query
*hq
,
1664 const struct nvc0_hw_sm_query_cfg
*cfg
,
1667 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1670 for (p
= 0; p
< mp_count
; ++p
) {
1671 const unsigned b
= (0x30 / 4) * p
;
1673 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1674 if (hq
->data
[b
+ 8] != hq
->sequence
) {
1677 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1680 count
[p
][c
] = hq
->data
[b
+ hsq
->ctr
[c
]] * (1 << c
);
1687 nve4_hw_sm_query_read_data(uint32_t count
[32][8],
1688 struct nvc0_context
*nvc0
, bool wait
,
1689 struct nvc0_hw_query
*hq
,
1690 const struct nvc0_hw_sm_query_cfg
*cfg
,
1693 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1696 for (p
= 0; p
< mp_count
; ++p
) {
1697 const unsigned b
= (0x60 / 4) * p
;
1699 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1701 for (d
= 0; d
< ((hsq
->ctr
[c
] & ~3) ? 1 : 4); ++d
) {
1702 if (hq
->data
[b
+ 20 + d
] != hq
->sequence
) {
1705 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1708 if (hsq
->ctr
[c
] & ~0x3)
1709 count
[p
][c
] = hq
->data
[b
+ 16 + (hsq
->ctr
[c
] & 3)];
1711 count
[p
][c
] += hq
->data
[b
+ d
* 4 + hsq
->ctr
[c
]];
1719 nvc0_hw_sm_get_query_result(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
,
1720 boolean wait
, union pipe_query_result
*result
)
1722 uint32_t count
[32][8];
1724 unsigned mp_count
= MIN2(nvc0
->screen
->mp_count_compute
, 32);
1726 const struct nvc0_hw_sm_query_cfg
*cfg
;
1729 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1731 if (nvc0
->screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1732 ret
= nve4_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
1734 ret
= nvc0_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
1738 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1739 for (p
= 0; p
< mp_count
; ++p
)
1740 value
+= count
[p
][c
];
1741 value
= (value
* cfg
->norm
[0]) / cfg
->norm
[1];
1743 *(uint64_t *)result
= value
;
1747 static const struct nvc0_hw_query_funcs hw_sm_query_funcs
= {
1748 .destroy_query
= nvc0_hw_sm_destroy_query
,
1749 .begin_query
= nvc0_hw_sm_begin_query
,
1750 .end_query
= nvc0_hw_sm_end_query
,
1751 .get_query_result
= nvc0_hw_sm_get_query_result
,
1754 struct nvc0_hw_query
*
1755 nvc0_hw_sm_create_query(struct nvc0_context
*nvc0
, unsigned type
)
1757 struct nvc0_screen
*screen
= nvc0
->screen
;
1758 struct nvc0_hw_sm_query
*hsq
;
1759 struct nvc0_hw_query
*hq
;
1762 if (nvc0
->screen
->base
.drm
->version
< 0x01000101)
1765 if (type
< NVC0_HW_SM_QUERY(0) || type
> NVC0_HW_SM_QUERY_LAST
)
1768 hsq
= CALLOC_STRUCT(nvc0_hw_sm_query
);
1773 hq
->funcs
= &hw_sm_query_funcs
;
1774 hq
->base
.type
= type
;
1776 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1798 * [50] = WS0.sequence
1799 * [54] = WS1.sequence
1800 * [58] = WS2.sequence
1801 * [5c] = WS3.sequence
1803 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1806 * Note that padding is used to align memory access to 128 bits.
1817 * [20] = MP.sequence
1822 space
= (8 + 1 + 3) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1825 if (!nvc0_hw_query_allocate(nvc0
, &hq
->base
, space
)) {
1834 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen
*screen
, unsigned id
,
1835 struct pipe_driver_query_info
*info
)
1839 if (screen
->base
.drm
->version
>= 0x01000101) {
1840 if (screen
->compute
)
1841 count
= nvc0_hw_sm_get_num_queries(screen
);
1848 if (screen
->compute
) {
1849 if (screen
->base
.class_3d
<= NVF0_3D_CLASS
) {
1850 const struct nvc0_hw_sm_query_cfg
**queries
=
1851 nvc0_hw_sm_get_queries(screen
);
1853 info
->name
= nvc0_hw_sm_query_get_name(queries
[id
]->type
);
1854 info
->query_type
= NVC0_HW_SM_QUERY(queries
[id
]->type
);
1855 info
->group_id
= NVC0_HW_SM_QUERY_GROUP
;