/*
 * mesa.git: src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
 * (page header from extraction: "gallium: Use enum pipe_shader_type in
 * set_sampler_views()")
 */
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
/* NOTE: intentionally using the same names as NV */
/* Listing of every MP ("SM" / multiprocessor) hardware performance counter
 * query this driver can expose: the query type id, its user-visible name,
 * and the description string reported to performance tools. */
#define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }
static const struct {
   unsigned type;    /* NVC0_HW_SM_QUERY_* id */
   const char *name; /* user-visible query name */
   const char *desc; /* human-readable description */
} nvc0_hw_sm_queries[] = {
   _Q(ACTIVE_CYCLES,
      "active_cycles",
      "Number of cycles a multiprocessor has at least one active warp"),

   _Q(ACTIVE_WARPS,
      "active_warps",
      "Accumulated number of active warps per cycle. For every cycle it "
      "increments by the number of active warps in the cycle which can be in "
      "the range 0 to 64"),

   _Q(ATOM_CAS_COUNT,
      "atom_cas_count",
      "Number of warps executing atomic compare and swap operations. Increments "
      "by one if at least one thread in a warp executes the instruction."),

   _Q(ATOM_COUNT,
      "atom_count",
      "Number of warps executing atomic reduction operations. Increments by one "
      "if at least one thread in a warp executes the instruction"),

   _Q(BRANCH,
      "branch",
      "Number of branch instructions executed per warp on a multiprocessor"),

   _Q(DIVERGENT_BRANCH,
      "divergent_branch",
      "Number of divergent branches within a warp. This counter will be "
      "incremented by one if at least one thread in a warp diverges (that is, "
      "follows a different execution path) via a conditional branch"),

   _Q(GLD_REQUEST,
      "gld_request",
      "Number of executed load instructions where the state space is not "
      "specified and hence generic addressing is used, increments per warp on a "
      "multiprocessor. It can include the load operations from global,local and "
      "shared state space"),

   _Q(GLD_MEM_DIV_REPLAY,
      "global_ld_mem_divergence_replays",
      "Number of instruction replays for global memory loads. Instruction is "
      "replayed if the instruction is accessing more than one cache line of "
      "128 bytes. For each extra cache line access the counter is incremented "
      "by 1"),

   _Q(GST_TRANSACTIONS,
      "global_store_transaction",
      "Number of global store transactions. Increments by 1 per transaction. "
      "Transaction can be 32/64/96/128B"),

   _Q(GST_MEM_DIV_REPLAY,
      "global_st_mem_divergence_replays",
      "Number of instruction replays for global memory stores. Instruction is "
      "replayed if the instruction is accessing more than one cache line of "
      "128 bytes. For each extra cache line access the counter is incremented "
      "by 1"),

   _Q(GRED_COUNT,
      "gred_count",
      "Number of warps executing reduction operations on global memory. "
      "Increments by one if at least one thread in a warp executes the "
      "instruction"),

   _Q(GST_REQUEST,
      "gst_request",
      "Number of executed store instructions where the state space is not "
      "specified and hence generic addressing is used, increments per warp on a "
      "multiprocessor. It can include the store operations to global,local and "
      "shared state space"),

   _Q(INST_EXECUTED,
      "inst_executed",
      "Number of instructions executed, do not include replays"),

   _Q(INST_ISSUED,
      "inst_issued",
      "Number of instructions issued including replays"),

   _Q(INST_ISSUED1,
      "inst_issued1",
      "Number of single instruction issued per cycle"),

   _Q(INST_ISSUED2,
      "inst_issued2",
      "Number of dual instructions issued per cycle"),

   _Q(INST_ISSUED1_0,
      "inst_issued1_0",
      "Number of single instruction issued per cycle in pipeline 0"),

   _Q(INST_ISSUED1_1,
      "inst_issued1_1",
      "Number of single instruction issued per cycle in pipeline 1"),

   _Q(INST_ISSUED2_0,
      "inst_issued2_0",
      "Number of dual instructions issued per cycle in pipeline 0"),

   _Q(INST_ISSUED2_1,
      "inst_issued2_1",
      "Number of dual instructions issued per cycle in pipeline 1"),

   _Q(L1_GLD_HIT,
      "l1_global_load_hit",
      "Number of cache lines that hit in L1 cache for global memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_GLD_MISS,
      "l1_global_load_miss",
      "Number of cache lines that miss in L1 cache for global memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_GLD_TRANSACTIONS,
      "__l1_global_load_transactions",
      "Number of global load transactions from L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B"),

   _Q(L1_GST_TRANSACTIONS,
      "__l1_global_store_transactions",
      "Number of global store transactions from L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B"),

   _Q(L1_LOCAL_LD_HIT,
      "l1_local_load_hit",
      "Number of cache lines that hit in L1 cache for local memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_LOCAL_LD_MISS,
      "l1_local_load_miss",
      "Number of cache lines that miss in L1 cache for local memory load "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_LOCAL_ST_HIT,
      "l1_local_store_hit",
      "Number of cache lines that hit in L1 cache for local memory store "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32, 64 and 128 bit accesses by a warp respectively"),

   _Q(L1_LOCAL_ST_MISS,
      "l1_local_store_miss",
      "Number of cache lines that miss in L1 cache for local memory store "
      "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
      "32,64 and 128 bit accesses by a warp respectively"),

   _Q(L1_SHARED_LD_TRANSACTIONS,
      "l1_shared_load_transactions",
      "Number of shared load transactions. Increments by 1 per transaction. "
      "Transaction can be 32/64/96/128B"),

   _Q(L1_SHARED_ST_TRANSACTIONS,
      "l1_shared_store_transactions",
      "Number of shared store transactions. Increments by 1 per transaction. "
      "Transaction can be 32/64/96/128B"),

   _Q(LOCAL_LD,
      "local_load",
      "Number of executed load instructions where state space is specified as "
      "local, increments per warp on a multiprocessor"),

   _Q(LOCAL_LD_TRANSACTIONS,
      "local_load_transactions",
      "Number of local load transactions from L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B"),

   _Q(LOCAL_ST,
      "local_store",
      "Number of executed store instructions where state space is specified as "
      "local, increments per warp on a multiprocessor"),

   _Q(LOCAL_ST_TRANSACTIONS,
      "local_store_transactions",
      "Number of local store transactions to L1 cache. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B."),

   _Q(NOT_PRED_OFF_INST_EXECUTED,
      "not_predicated_off_thread_inst_executed",
      "Number of not predicated off instructions executed by all threads, does "
      "not include replays. For each instruction it increments by the number of "
      "threads that execute this instruction"),

   _Q(PROF_TRIGGER_0,
      "prof_trigger_00",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_1,
      "prof_trigger_01",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_2,
      "prof_trigger_02",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_3,
      "prof_trigger_03",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_4,
      "prof_trigger_04",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_5,
      "prof_trigger_05",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_6,
      "prof_trigger_06",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(PROF_TRIGGER_7,
      "prof_trigger_07",
      "User profiled generic trigger that can be inserted in any place of the "
      "code to collect the related information. Increments per warp."),

   _Q(SHARED_LD,
      "shared_load",
      "Number of executed load instructions where state space is specified as "
      "shared, increments per warp on a multiprocessor"),

   _Q(SHARED_LD_REPLAY,
      "shared_load_replay",
      "Replays caused due to shared load bank conflict (when the addresses for "
      "two or more shared memory load requests fall in the same memory bank) or "
      "when there is no conflict but the total number of words accessed by all "
      "threads in the warp executing that instruction exceed the number of words "
      "that can be loaded in one cycle (256 bytes)"),

   _Q(SHARED_ST,
      "shared_store",
      "Number of executed store instructions where state space is specified as "
      "shared, increments per warp on a multiprocessor"),

   _Q(SHARED_ST_REPLAY,
      "shared_store_replay",
      "Replays caused due to shared store bank conflict (when the addresses for "
      "two or more shared memory store requests fall in the same memory bank) or "
      "when there is no conflict but the total number of words accessed by all "
      "threads in the warp executing that instruction exceed the number of words "
      "that can be stored in one cycle"),

   _Q(SM_CTA_LAUNCHED,
      "sm_cta_launched",
      "Number of thread blocks launched on a multiprocessor"),

   _Q(THREADS_LAUNCHED,
      "threads_launched",
      "Number of threads launched on a multiprocessor"),

   _Q(TH_INST_EXECUTED,
      "thread_inst_executed",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction"),

   _Q(TH_INST_EXECUTED_0,
      "thread_inst_executed_0",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 0"),

   _Q(TH_INST_EXECUTED_1,
      "thread_inst_executed_1",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 1"),

   _Q(TH_INST_EXECUTED_2,
      "thread_inst_executed_2",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 2"),

   _Q(TH_INST_EXECUTED_3,
      "thread_inst_executed_3",
      "Number of instructions executed by all threads, does not include "
      "replays. For each instruction it increments by the number of threads in "
      "the warp that execute the instruction in pipeline 3"),

   _Q(UNCACHED_GLD_TRANSACTIONS,
      "uncached_global_load_transaction",
      "Number of uncached global load transactions. Increments by 1 per "
      "transaction. Transaction can be 32/64/96/128B."),

   _Q(WARPS_LAUNCHED,
      "warps_launched",
      "Number of warps launched on a multiprocessor"),
};

#undef _Q
338
339 static inline const char *
340 nvc0_hw_sm_query_get_name(unsigned query_type)
341 {
342 unsigned i;
343
344 for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) {
345 if (nvc0_hw_sm_queries[i].type == query_type)
346 return nvc0_hw_sm_queries[i].name;
347 }
348 assert(0);
349 return NULL;
350 }
351
/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */

/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
/* Pre-assembled GK104 (Kepler) compute kernel; the disassembly below is the
 * authoritative description of what it does. Do not edit the hex words
 * without reassembling. */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c7[0x620]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c7[0x624]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c7[0x628]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x28005c1880029de4ULL,
   0x7000c01050c21c03ULL,
   0x28005c189002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x28005c18a0001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
439
/* GK110 (nvf0) encoding of the counter-readout kernel above; same logic as
 * the GK104 version, re-encoded for the different ISA encoding. */
static const uint64_t nvf0_read_hw_sm_counters_code[] =
{
   /* Same kernel as GK104 */
   0x0880808080808080ULL,
   0x86400000109c0022ULL,
   0x86400000019c0032ULL,
   0x86400000021c0002ULL,
   0x86400000029c0006ULL,
   0x86400000031c000aULL,
   0x86400000039c000eULL,
   0x86400000041c0012ULL,
   0x08ac1080108c8080ULL,
   0x86400000049c0016ULL,
   0x86400000051c001aULL,
   0x86400000059c001eULL,
   0xdb201c007f9c201eULL,
   0x64c03ce0c41c002aULL,
   0xc00000020a1c3021ULL,
   0x64c03ce0c49c002eULL,
   0x0810a0808010b810ULL,
   0xc0000001041c3025ULL,
   0x180000000020003cULL,
   0xdb201c007f9c243eULL,
   0xc1c00000301c2021ULL,
   0xc1c00000081c2431ULL,
   0xc1c00000021c2435ULL,
   0xe0800000069c2026ULL,
   0x08b010b010b010a0ULL,
   0xe0800000061c2022ULL,
   0xe4c03c00051c0032ULL,
   0xe0840000041c282aULL,
   0xe4c03c00059c0036ULL,
   0xe08040007f9c2c2eULL,
   0xe0840000049c3032ULL,
   0xfe800000001c2800ULL,
   0x080000b81080b010ULL,
   0x64c03ce0c51c0002ULL,
   0xe08040007f9c3436ULL,
   0xfe80000020043010ULL,
   0xfc800000281c3000ULL,
   0x18000000001c003cULL,
};
482
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Hardware configuration of a single performance counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};

/* Full configuration of one HW SM query: which query type it implements and
 * the counter slot(s) it needs. */
struct nvc0_hw_sm_query_cfg
{
   unsigned type;                        /* NVC0_HW_SM_QUERY_* id */
   struct nvc0_hw_sm_counter_cfg ctr[8]; /* only num_counters entries used */
   uint8_t num_counters;
   uint8_t norm[2]; /* normalization num,denom */
};

/* _CA/_CB build a counter cfg sourced from the PM_A resp. PM_B signal
 * domain; arguments are (func, mode, signal group, source selection). */
#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }
#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }
/* NOTE(review): _Q is not referenced in this part of the file — presumably
 * used (or was used) for indexed query tables; confirm before removing. */
#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
510
/* ==== Compute capability 3.0 (GK104:GK110) ==== */
static const struct nvc0_hw_sm_query_cfg
sm30_active_cycles =
{
   .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
   .ctr[0] = _CB(0x0001, B6, WARP, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_active_warps =
{
   .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
   .ctr[0] = _CB(0x003f, B6, WARP, 0x31483104),
   .num_counters = 1,
   /* x2 normalization: bit 0 of the raw count alternates for an odd number
    * of warps (see the NOTES comment above sm30_hw_sm_queries). */
   .norm = { 2, 1 },
};
529
530 static const struct nvc0_hw_sm_query_cfg
531 sm30_atom_cas_count =
532 {
533 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
534 .ctr[0] = _CA(0x0001, B6, BRANCH, 0x000000004),
535 .num_counters = 1,
536 .norm = { 1, 1 },
537 };
538
/* Per-query counter configurations for compute capability 3.0 (GK104).
 * Each entry maps one NVC0_HW_SM_QUERY_* type onto the hardware signal
 * group / source selection that produces it. */
static const struct nvc0_hw_sm_query_cfg
sm30_atom_count =
{
   .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
   .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_branch =
{
   .type = NVC0_HW_SM_QUERY_BRANCH,
   .ctr[0] = _CA(0x0001, B6, BRANCH, 0x0000000c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_divergent_branch =
{
   .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
   .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_gld_request =
{
   .type = NVC0_HW_SM_QUERY_GLD_REQUEST,
   .ctr[0] = _CA(0x0001, B6, LDST, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_gld_mem_div_replay =
{
   .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
   .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_gst_transactions =
{
   .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, MEM, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_gst_mem_div_replay =
{
   .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
   .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000014),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_gred_count =
{
   .type = NVC0_HW_SM_QUERY_GRED_COUNT,
   .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_gst_request =
{
   .type = NVC0_HW_SM_QUERY_GST_REQUEST,
   .ctr[0] = _CA(0x0001, B6, LDST, 0x00000014),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_inst_executed =
{
   .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
   .ctr[0] = _CA(0x0003, B6, EXEC, 0x00000398),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_inst_issued1 =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
   .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_inst_issued2 =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
   .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_gld_hit =
{
   .type = NVC0_HW_SM_QUERY_L1_GLD_HIT,
   .ctr[0] = _CB(0x0001, B6, L1, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_gld_miss =
{
   .type = NVC0_HW_SM_QUERY_L1_GLD_MISS,
   .ctr[0] = _CB(0x0001, B6, L1, 0x00000014),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_gld_transactions =
{
   .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_gst_transactions =
{
   .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_local_ld_hit =
{
   .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT,
   .ctr[0] = _CB(0x0001, B6, L1, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_local_ld_miss =
{
   .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS,
   .ctr[0] = _CB(0x0001, B6, L1, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_local_st_hit =
{
   .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT,
   .ctr[0] = _CB(0x0001, B6, L1, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_local_st_miss =
{
   .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS,
   .ctr[0] = _CB(0x0001, B6, L1, 0x0000000c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_shared_ld_transactions =
{
   .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_l1_shared_st_transactions =
{
   .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x0000000c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_local_ld =
{
   .type = NVC0_HW_SM_QUERY_LOCAL_LD,
   .ctr[0] = _CA(0x0001, B6, LDST, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_local_ld_transactions =
{
   .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_local_st =
{
   .type = NVC0_HW_SM_QUERY_LOCAL_ST,
   .ctr[0] = _CA(0x0001, B6, LDST, 0x0000000c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_local_st_transactions =
{
   .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* The eight user profile triggers share the USER signal group; the source
 * selection steps by 4 per trigger index. */
static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_0 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
   .ctr[0] = _CA(0x0001, B6, USER, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_1 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
   .ctr[0] = _CA(0x0001, B6, USER, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_2 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
   .ctr[0] = _CA(0x0001, B6, USER, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_3 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
   .ctr[0] = _CA(0x0001, B6, USER, 0x0000000c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_4 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
   .ctr[0] = _CA(0x0001, B6, USER, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_5 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
   .ctr[0] = _CA(0x0001, B6, USER, 0x00000014),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_6 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
   .ctr[0] = _CA(0x0001, B6, USER, 0x00000018),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_prof_trigger_7 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
   .ctr[0] = _CA(0x0001, B6, USER, 0x0000001c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_shared_ld =
{
   .type = NVC0_HW_SM_QUERY_SHARED_LD,
   .ctr[0] = _CA(0x0001, B6, LDST, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_shared_ld_replay =
{
   .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
   .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000008),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_shared_st =
{
   .type = NVC0_HW_SM_QUERY_SHARED_ST,
   .ctr[0] = _CA(0x0001, B6, LDST, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_shared_st_replay =
{
   .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
   .ctr[0] = _CB(0x0001, B6, REPLAY, 0x0000000c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_sm_cta_launched =
{
   .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
   .ctr[0] = _CB(0x0001, B6, WARP, 0x0000001c),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_threads_launched =
{
   .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
   .ctr[0] = _CA(0x003f, B6, LAUNCH, 0x398a4188),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_uncached_gld_transactions =
{
   .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
   .ctr[0] = _CB(0x0001, B6, MEM, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm30_warps_launched =
{
   .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   .ctr[0] = _CA(0x0001, B6, LAUNCH, 0x00000004),
   .num_counters = 1,
   .norm = { 1, 1 },
};
916
/* NOTES:
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 */
/* All queries available on compute capability 3.0 (GK104) hardware. */
static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] =
{
   &sm30_active_cycles,
   &sm30_active_warps,
   &sm30_atom_cas_count,
   &sm30_atom_count,
   &sm30_branch,
   &sm30_divergent_branch,
   &sm30_gld_request,
   &sm30_gld_mem_div_replay,
   &sm30_gst_transactions,
   &sm30_gst_mem_div_replay,
   &sm30_gred_count,
   &sm30_gst_request,
   &sm30_inst_executed,
   &sm30_inst_issued1,
   &sm30_inst_issued2,
   &sm30_l1_gld_hit,
   &sm30_l1_gld_miss,
   &sm30_l1_gld_transactions,
   &sm30_l1_gst_transactions,
   &sm30_l1_local_ld_hit,
   &sm30_l1_local_ld_miss,
   &sm30_l1_local_st_hit,
   &sm30_l1_local_st_miss,
   &sm30_l1_shared_ld_transactions,
   &sm30_l1_shared_st_transactions,
   &sm30_local_ld,
   &sm30_local_ld_transactions,
   &sm30_local_st,
   &sm30_local_st_transactions,
   &sm30_prof_trigger_0,
   &sm30_prof_trigger_1,
   &sm30_prof_trigger_2,
   &sm30_prof_trigger_3,
   &sm30_prof_trigger_4,
   &sm30_prof_trigger_5,
   &sm30_prof_trigger_6,
   &sm30_prof_trigger_7,
   &sm30_shared_ld,
   &sm30_shared_ld_replay,
   &sm30_shared_st,
   &sm30_shared_st_replay,
   &sm30_sm_cta_launched,
   &sm30_threads_launched,
   &sm30_uncached_gld_transactions,
   &sm30_warps_launched,
};
969
970 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
/* GK110-specific config: atom CAS operations use a different signal/source
 * selection than on GK104 (sm30). One A-domain counter, no normalization. */
static const struct nvc0_hw_sm_query_cfg
sm35_atom_cas_count =
{
   .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
   .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000014),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* GK110-specific config for the atom_count query. */
static const struct nvc0_hw_sm_query_cfg
sm35_atom_count =
{
   .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
   .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* GK110-specific config for the gred_count (global reduction) query. */
static const struct nvc0_hw_sm_query_cfg
sm35_gred_count =
{
   .type = NVC0_HW_SM_QUERY_GRED_COUNT,
   .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000018),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* GK110-specific config for not_predicated_off_thread_inst_executed. */
static const struct nvc0_hw_sm_query_cfg
sm35_not_pred_off_inst_executed =
{
   .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
   .ctr[0] = _CA(0x003f, B6, UNK14, 0x29062080),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* GK110: shared load replays need two B-domain counters (values are summed
 * when the result is read). */
static const struct nvc0_hw_sm_query_cfg
sm35_shared_ld_replay =
{
   .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
   .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018),
   .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x00000151),
   .num_counters = 2,
   .norm = { 1, 1 },
};

/* GK110: shared store replays, same two-counter scheme as loads. */
static const struct nvc0_hw_sm_query_cfg
sm35_shared_st_replay =
{
   .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
   .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018),
   .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x000001d1),
   .num_counters = 2,
   .norm = { 1, 1 },
};

/* GK110-specific config for thread_inst_executed. */
static const struct nvc0_hw_sm_query_cfg
sm35_th_inst_executed =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
   .ctr[0] = _CA(0x003f, B6, UNK11, 0x29062080),
   .num_counters = 1,
   .norm = { 1, 1 },
};
1035
/* GK110/GK208 query table: mostly reuses the GK104 (sm30) configs, with
 * sm35_* overrides for the handful of signals that moved. */
static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
{
   &sm30_active_cycles,
   &sm30_active_warps,
   &sm35_atom_cas_count,
   &sm35_atom_count,
   &sm30_gld_request,
   &sm30_gld_mem_div_replay,
   &sm30_gst_transactions,
   &sm30_gst_mem_div_replay,
   &sm35_gred_count,
   &sm30_gst_request,
   &sm30_inst_executed,
   &sm30_inst_issued1,
   &sm30_inst_issued2,
   &sm30_l1_gld_hit,
   &sm30_l1_gld_miss,
   &sm30_l1_gld_transactions,
   &sm30_l1_gst_transactions,
   &sm30_l1_local_ld_hit,
   &sm30_l1_local_ld_miss,
   &sm30_l1_local_st_hit,
   &sm30_l1_local_st_miss,
   &sm30_l1_shared_ld_transactions,
   &sm30_l1_shared_st_transactions,
   &sm30_local_ld,
   &sm30_local_ld_transactions,
   &sm30_local_st,
   &sm30_local_st_transactions,
   &sm35_not_pred_off_inst_executed,
   &sm30_prof_trigger_0,
   &sm30_prof_trigger_1,
   &sm30_prof_trigger_2,
   &sm30_prof_trigger_3,
   &sm30_prof_trigger_4,
   &sm30_prof_trigger_5,
   &sm30_prof_trigger_6,
   &sm30_prof_trigger_7,
   &sm30_shared_ld,
   &sm35_shared_ld_replay,
   &sm30_shared_st,
   &sm35_shared_st_replay,
   &sm30_sm_cta_launched,
   &sm35_th_inst_executed,
   &sm30_threads_launched,
   &sm30_uncached_gld_transactions,
   &sm30_warps_launched,
};
1084
1085 #undef _Q
1086 #undef _CA
1087 #undef _CB
1088
1089 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
1090 /* NOTES:
1091 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
1092 * because there is a context-switch problem that we need to fix.
1093 * Results might be wrong sometimes, be careful!
1094 */
/* Pre-assembled Fermi compute kernel that copies the 8 MP performance
 * counters ($pm0..$pm7) plus the sequence number (read from c15[], written
 * by nvc0_hw_sm_upload_input) into this MP's slot of the query buffer.
 * One warp per MP executes; only thread 0 ($tidx == 0) stores.
 * See the disassembly below. */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c15[0x620]
    * mov b32 $r11 c15[0x624]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c15[0x628]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x28007c1880029de4ULL,
   0x28007c189002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x28007c18a0021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
1144
1145 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
1146
1147 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
/* Cycles with at least one active warp (see the query description table). */
static const struct nvc0_hw_sm_query_cfg
sm20_active_cycles =
{
   .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* Accumulated active warps per cycle: six sub-counters on signal 0x24,
 * summed when the query result is read. */
static const struct nvc0_hw_sm_query_cfg
sm20_active_warps =
{
   .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_atom_count =
{
   .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* Branches: two sub-counters on signal 0x1a, summed at read time. */
static const struct nvc0_hw_sm_query_cfg
sm20_branch =
{
   .type = NVC0_HW_SM_QUERY_BRANCH,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
   .num_counters = 2,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_divergent_branch =
{
   .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
   .num_counters = 2,
   .norm = { 1, 1 },
};

/* Global/local/shared memory requests all share signal 0x64; only the
 * source sub-signal differs between the configs below. */
static const struct nvc0_hw_sm_query_cfg
sm20_gld_request =
{
   .type = NVC0_HW_SM_QUERY_GLD_REQUEST,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gred_count =
{
   .type = NVC0_HW_SM_QUERY_GRED_COUNT,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_gst_request =
{
   .type = NVC0_HW_SM_QUERY_GST_REQUEST,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_inst_executed =
{
   .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
   .num_counters = 2,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_inst_issued =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
   .num_counters = 2,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_local_ld =
{
   .type = NVC0_HW_SM_QUERY_LOCAL_LD,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_local_st =
{
   .type = NVC0_HW_SM_QUERY_LOCAL_ST,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .norm = { 1, 1 },
};
1264
/* The eight user-programmable profiler triggers all live on signal 0x01;
 * the trigger index selects the source sub-signal (0x00, 0x10, ... 0x70). */
static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_0 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_1 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_2 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_3 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_4 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_5 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_6 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_7 =
{
   .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
   .num_counters = 1,
   .norm = { 1, 1 },
};
1336
static const struct nvc0_hw_sm_query_cfg
sm20_shared_ld =
{
   .type = NVC0_HW_SM_QUERY_SHARED_LD,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_shared_st =
{
   .type = NVC0_HW_SM_QUERY_SHARED_ST,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* Six sub-counters on signal 0x26, summed when the result is read. */
static const struct nvc0_hw_sm_query_cfg
sm20_threads_launched =
{
   .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .norm = { 1, 1 },
};

/* Per-thread instructions executed: GF100/GF110 expose this as two 6-way
 * queries (signals 0x2f and 0x30). */
static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_0 =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_1 =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_warps_launched =
{
   .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .norm = { 1, 1 },
};
1405
/* GF100/GF110 (compute capability 2.0) query table. */
static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
{
   &sm20_active_cycles,
   &sm20_active_warps,
   &sm20_atom_count,
   &sm20_branch,
   &sm20_divergent_branch,
   &sm20_gld_request,
   &sm20_gred_count,
   &sm20_gst_request,
   &sm20_inst_executed,
   &sm20_inst_issued,
   &sm20_local_ld,
   &sm20_local_st,
   &sm20_prof_trigger_0,
   &sm20_prof_trigger_1,
   &sm20_prof_trigger_2,
   &sm20_prof_trigger_3,
   &sm20_prof_trigger_4,
   &sm20_prof_trigger_5,
   &sm20_prof_trigger_6,
   &sm20_prof_trigger_7,
   &sm20_shared_ld,
   &sm20_shared_st,
   &sm20_threads_launched,
   &sm20_th_inst_executed_0,
   &sm20_th_inst_executed_1,
   &sm20_warps_launched,
};
1435
1436 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
/* GF10x/GF11x (except GF100/GF110): inst_executed uses three sub-counters
 * on signal 0x2d instead of two, with a narrower source mask. */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_executed =
{
   .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
   .num_counters = 3,
   .norm = { 1, 1 },
};

/* Single- and dual-issue counts are split per issue slot on sm21
 * (signal 0x7e, different source sub-signals). */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_0 =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_1 =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_0 =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_1 =
{
   .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1,
   .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .norm = { 1, 1 },
};

/* Per-thread instructions executed: four 6-way queries on sm21
 * (signals 0xa3, 0xa5, 0xa4, 0xa6). */
static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_0 =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_1 =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_2 =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_3 =
{
   .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
   .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
   .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
   .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
   .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
   .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
   .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .norm = { 1, 1 },
};
1539
/* Compute capability 2.1 table: shares most configs with sm20, overriding
 * only the instruction-count queries. */
static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
{
   &sm20_active_cycles,
   &sm20_active_warps,
   &sm20_atom_count,
   &sm20_branch,
   &sm20_divergent_branch,
   &sm20_gld_request,
   &sm20_gred_count,
   &sm20_gst_request,
   &sm21_inst_executed,
   &sm21_inst_issued1_0,
   &sm21_inst_issued1_1,
   &sm21_inst_issued2_0,
   &sm21_inst_issued2_1,
   &sm20_local_ld,
   &sm20_local_st,
   &sm20_prof_trigger_0,
   &sm20_prof_trigger_1,
   &sm20_prof_trigger_2,
   &sm20_prof_trigger_3,
   &sm20_prof_trigger_4,
   &sm20_prof_trigger_5,
   &sm20_prof_trigger_6,
   &sm20_prof_trigger_7,
   &sm20_shared_ld,
   &sm20_shared_st,
   &sm20_threads_launched,
   &sm21_th_inst_executed_0,
   &sm21_th_inst_executed_1,
   &sm21_th_inst_executed_2,
   &sm21_th_inst_executed_3,
   &sm20_warps_launched,
};
1574
1575 #undef _C
1576
1577 static inline const struct nvc0_hw_sm_query_cfg **
1578 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
1579 {
1580 struct nouveau_device *dev = screen->base.device;
1581
1582 switch (screen->base.class_3d) {
1583 case NVF0_3D_CLASS:
1584 return sm35_hw_sm_queries;
1585 case NVE4_3D_CLASS:
1586 return sm30_hw_sm_queries;
1587 default:
1588 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
1589 return sm20_hw_sm_queries;
1590 return sm21_hw_sm_queries;
1591 }
1592 assert(0);
1593 return NULL;
1594 }
1595
1596 unsigned
1597 nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
1598 {
1599 struct nouveau_device *dev = screen->base.device;
1600
1601 switch (screen->base.class_3d) {
1602 case NVF0_3D_CLASS:
1603 return ARRAY_SIZE(sm35_hw_sm_queries);
1604 case NVE4_3D_CLASS:
1605 return ARRAY_SIZE(sm30_hw_sm_queries);
1606 default:
1607 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
1608 return ARRAY_SIZE(sm20_hw_sm_queries);
1609 return ARRAY_SIZE(sm21_hw_sm_queries);
1610 }
1611 return 0;
1612 }
1613
1614 static const struct nvc0_hw_sm_query_cfg *
1615 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1616 {
1617 const struct nvc0_hw_sm_query_cfg **queries;
1618 struct nvc0_screen *screen = nvc0->screen;
1619 struct nvc0_query *q = &hq->base;
1620 unsigned num_queries;
1621 unsigned i;
1622
1623 num_queries = nvc0_hw_sm_get_num_queries(screen);
1624 queries = nvc0_hw_sm_get_queries(screen);
1625
1626 for (i = 0; i < num_queries; i++) {
1627 if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type)
1628 return queries[i];
1629 }
1630 assert(0);
1631 return NULL;
1632 }
1633
/* Destroy an MP performance counter query: release its buffer (size 0
 * frees the backing BO), drop the fence reference, then the object. */
static void
nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_query *q = &hq->base;
   nvc0_hw_query_allocate(nvc0, q, 0);
   nouveau_fence_ref(NULL, &hq->fence);
   FREE(hq);
}
1642
1643 static boolean
1644 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
1645 {
1646 struct nvc0_screen *screen = nvc0->screen;
1647 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
1648 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1649 const struct nvc0_hw_sm_query_cfg *cfg;
1650 unsigned i, c;
1651 unsigned num_ab[2] = { 0, 0 };
1652
1653 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1654
1655 /* check if we have enough free counter slots */
1656 for (i = 0; i < cfg->num_counters; ++i)
1657 num_ab[cfg->ctr[i].sig_dom]++;
1658
1659 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
1660 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
1661 NOUVEAU_ERR("Not enough free MP counter slots !\n");
1662 return false;
1663 }
1664
1665 assert(cfg->num_counters <= 4);
1666 PUSH_SPACE(push, 4 * 8 * + 6);
1667
1668 if (!screen->pm.mp_counters_enabled) {
1669 screen->pm.mp_counters_enabled = true;
1670 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
1671 PUSH_DATA (push, 0x1fcb);
1672 }
1673
1674 /* set sequence field to 0 (used to check if result is available) */
1675 for (i = 0; i < screen->mp_count; ++i)
1676 hq->data[i * 10 + 10] = 0;
1677 hq->sequence++;
1678
1679 for (i = 0; i < cfg->num_counters; ++i) {
1680 const unsigned d = cfg->ctr[i].sig_dom;
1681
1682 if (!screen->pm.num_hw_sm_active[d]) {
1683 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
1684 if (screen->pm.num_hw_sm_active[!d])
1685 m |= 1 << (7 + (8 * d));
1686 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
1687 PUSH_DATA (push, m);
1688 }
1689 screen->pm.num_hw_sm_active[d]++;
1690
1691 for (c = d * 4; c < (d * 4 + 4); ++c) {
1692 if (!screen->pm.mp_counter[c]) {
1693 hsq->ctr[i] = c;
1694 screen->pm.mp_counter[c] = hsq;
1695 break;
1696 }
1697 }
1698 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
1699
1700 /* configure and reset the counter(s) */
1701 if (d == 0)
1702 BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1);
1703 else
1704 BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1);
1705 PUSH_DATA (push, cfg->ctr[i].sig_sel);
1706 BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1);
1707 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
1708 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1);
1709 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
1710 BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
1711 PUSH_DATA (push, 0);
1712 }
1713 return true;
1714 }
1715
/* Start an MP performance counter query.  Kepler+ is forwarded to the
 * NVE4 path; Fermi has a single signal domain with 8 counter slots.
 * Returns false when not enough counter slots are free. */
static boolean
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return nve4_hw_sm_begin_query(nvc0, hq);

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return false;
   }

   assert(cfg->num_counters <= 8);
   /* 8 dwords of counter setup per counter plus 2 for the one-time
    * counting enable below. */
   PUSH_SPACE(push, 8 * 8 + 2);

   /* set sequence field to 0 (used to check if result is available) */
   for (i = 0; i < screen->mp_count; ++i) {
      /* Fermi layout: 0x30 bytes per MP, sequence at dword 8. */
      const unsigned b = (0x30 / 4) * i;
      hq->data[b + 8] = 0;
   }
   hq->sequence++;

   for (i = 0; i < cfg->num_counters; ++i) {
      uint32_t mask_sel = 0x00000000;

      /* Globally switch on counting when the first counter goes live. */
      if (!screen->pm.num_hw_sm_active[0]) {
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, 0x80000000);
      }
      screen->pm.num_hw_sm_active[0]++;

      /* Grab the first free counter slot. */
      for (c = 0; c < 8; ++c) {
         if (!screen->pm.mp_counter[c]) {
            hsq->ctr[i] = c;
            screen->pm.mp_counter[c] = hsq;
            break;
         }
      }

      /* Oddly-enough, the signal id depends on the slot selected on Fermi but
       * not on Kepler. Fortunately, the signal ids are just offseted by the
       * slot id! */
      mask_sel |= c;
      mask_sel |= (c << 8);
      mask_sel |= (c << 16);
      mask_sel |= (c << 24);
      mask_sel &= cfg->ctr[i].src_mask;

      /* configure and reset the counter(s) */
      BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
      BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
   return true;
}
1784
1785 static inline struct nvc0_program *
1786 nvc0_hw_sm_get_program(struct nvc0_screen *screen)
1787 {
1788 struct nvc0_program *prog;
1789
1790 prog = CALLOC_STRUCT(nvc0_program);
1791 if (!prog)
1792 return NULL;
1793
1794 prog->type = PIPE_SHADER_COMPUTE;
1795 prog->translated = true;
1796 prog->parm_size = 12;
1797
1798 if (screen->base.class_3d == NVE4_3D_CLASS ||
1799 screen->base.class_3d == NVF0_3D_CLASS) {
1800 if (screen->base.class_3d == NVE4_3D_CLASS) {
1801 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
1802 prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
1803 } else {
1804 prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code;
1805 prog->code_size = sizeof(nvf0_read_hw_sm_counters_code);
1806 }
1807 prog->num_gprs = 14;
1808 } else {
1809 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
1810 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
1811 prog->num_gprs = 12;
1812 }
1813 return prog;
1814 }
1815
/* Upload the three input words consumed by the counter-readback shader
 * (query buffer address lo/hi and the current sequence number) into the
 * driver's auxiliary constant buffer (c15, slot 5) at the MP_INFO offset.
 * These are the values the shader reads back from c15[0x620..0x628]. */
static inline void
nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t address;
   const int s = 5;

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   PUSH_SPACE(push, 11);

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      /* Kepler: inline upload through the UPLOAD methods. */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO);
      PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 3 * 4);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   } else {
      /* Fermi: write through the bound constant buffer (CB_POS). */
      BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
      PUSH_DATAh(push, address);
      PUSH_DATA (push, address);
      BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3);
      PUSH_DATA (push, NVC0_CB_AUX_MP_INFO);
   }
   /* Payload: buffer address (lo, hi), then the sequence number. */
   PUSH_DATA (push, (hq->bo->offset + hq->base_offset));
   PUSH_DATAh(push, (hq->bo->offset + hq->base_offset));
   PUSH_DATA (push, hq->sequence);
}
1849
/* Stop an MP performance counter query: freeze all counters, release the
 * slots owned by this query, launch the internal compute program that
 * copies the counter values into the query buffer, and finally re-enable
 * the counters still owned by other in-flight queries. */
static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   struct nvc0_program *old = nvc0->compprog;
   struct pipe_grid_info info = {};
   uint32_t mask;
   /* NOTE(review): input[] is never written here; the shader's real inputs
    * are uploaded by nvc0_hw_sm_upload_input() below.  Presumably info.input
    * only needs to be non-NULL for launch_grid — confirm. */
   uint32_t input[3];
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   unsigned c, i;

   /* Lazily build the readback program on first use; it is cached on the
    * screen. */
   if (unlikely(!screen->pm.prog))
      screen->pm.prog = nvc0_hw_sm_get_program(screen);

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   /* Flush the compute engine before reading the counters. */
   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);

   /* upload input data for the compute shader which reads MP counters */
   nvc0_hw_sm_upload_input(nvc0, hq);

   /* Run the readback shader: one workgroup per MP. */
   pipe->bind_compute_state(pipe, screen->pm.prog);
   for (i = 0; i < 3; i++) {
      info.block[i] = block[i];
      info.grid[i] = grid[i];
   }
   info.pc = 0;
   info.input = input;
   pipe->launch_grid(pipe, &info);
   pipe->bind_compute_state(pipe, old);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0;
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         /* Each slot only needs re-enabling once, even if a query owns
          * several of them. */
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}
1934
/* Read back per-MP counter values on Fermi (0x30 bytes per MP, sequence at
 * dword 8).  Returns false when the result is not yet available and wait
 * is not allowed (or waiting failed). */
static inline bool
nvc0_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x30 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         /* The readback shader stores the current sequence number after
          * the counter values; a mismatch means the data is stale. */
         if (hq->data[b + 8] != hq->sequence) {
            if (!wait)
               return false;
            if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
               return false;
         }
         /* NOTE(review): the (1 << c) factor scales each sub-counter by
          * 2^index — presumably compensating for the per-slot signal
          * offsets programmed in nvc0_hw_sm_begin_query(); confirm against
          * the hardware documentation. */
         count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
      }
   }
   return true;
}
1960
/* Read back per-MP counter values on Kepler (0x60 bytes per MP; see the
 * layout comment in nvc0_hw_sm_create_query()).  Slots 0-3 have one value
 * per warp scheduler, which are summed; slots 4-7 are single whole-MP
 * values.  Returns false when the result is not yet available and wait is
 * not allowed (or waiting failed). */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* Slots >= 4 have a single value; slots 0-3 have four (one per
          * warp scheduler). */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            /* One sequence dword per warp scheduler at dwords 20-23. */
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}
1992
1993 static boolean
1994 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
1995 boolean wait, union pipe_query_result *result)
1996 {
1997 uint32_t count[32][8];
1998 uint64_t value = 0;
1999 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
2000 unsigned p, c;
2001 const struct nvc0_hw_sm_query_cfg *cfg;
2002 bool ret;
2003
2004 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2005
2006 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
2007 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
2008 else
2009 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
2010 if (!ret)
2011 return false;
2012
2013 for (c = 0; c < cfg->num_counters; ++c)
2014 for (p = 0; p < mp_count; ++p)
2015 value += count[p][c];
2016 value = (value * cfg->norm[0]) / cfg->norm[1];
2017
2018 *(uint64_t *)result = value;
2019 return true;
2020 }
2021
/* Dispatch table plugged into the generic nvc0 HW query machinery for
 * MP performance counter queries. */
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};
2028
/* Create an MP performance counter query of the given type.
 * Requires a kernel recent enough to expose the compute engine
 * (drm >= 1.1.1); returns NULL on unsupported setups, out-of-range
 * types or allocation failure.  The buffer layout (and hence its size)
 * depends on the chipset generation, as documented inline below. */
struct nvc0_hw_query *
nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_hw_sm_query *hsq;
   struct nvc0_hw_query *hq;
   unsigned space;

   if (nvc0->screen->base.drm->version < 0x01000101)
      return NULL;

   if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)
      return NULL;

   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
   if (!hsq)
      return NULL;

   hq = &hsq->base;
   hq->funcs = &hw_sm_query_funcs;
   hq->base.type = type;

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      /* for each MP:
       * [00] = WS0.C0
       * [04] = WS0.C1
       * [08] = WS0.C2
       * [0c] = WS0.C3
       * [10] = WS1.C0
       * [14] = WS1.C1
       * [18] = WS1.C2
       * [1c] = WS1.C3
       * [20] = WS2.C0
       * [24] = WS2.C1
       * [28] = WS2.C2
       * [2c] = WS2.C3
       * [30] = WS3.C0
       * [34] = WS3.C1
       * [38] = WS3.C2
       * [3c] = WS3.C3
       * [40] = MP.C4
       * [44] = MP.C5
       * [48] = MP.C6
       * [4c] = MP.C7
       * [50] = WS0.sequence
       * [54] = WS1.sequence
       * [58] = WS2.sequence
       * [5c] = WS3.sequence
       */
      space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
   } else {
      /*
       * Note that padding is used to align memory access to 128 bits.
       *
       * for each MP:
       * [00] = MP.C0
       * [04] = MP.C1
       * [08] = MP.C2
       * [0c] = MP.C3
       * [10] = MP.C4
       * [14] = MP.C5
       * [18] = MP.C6
       * [1c] = MP.C7
       * [20] = MP.sequence
       * [24] = padding
       * [28] = padding
       * [2c] = padding
       */
      space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
   }

   if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
      FREE(hq);
      return NULL;
   }

   return hq;
}
2107
2108 int
2109 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
2110 struct pipe_driver_query_info *info)
2111 {
2112 int count = 0;
2113
2114 if (screen->base.drm->version >= 0x01000101) {
2115 if (screen->compute)
2116 count = nvc0_hw_sm_get_num_queries(screen);
2117 }
2118
2119 if (!info)
2120 return count;
2121
2122 if (id < count) {
2123 if (screen->compute) {
2124 if (screen->base.class_3d <= NVF0_3D_CLASS) {
2125 const struct nvc0_hw_sm_query_cfg **queries =
2126 nvc0_hw_sm_get_queries(screen);
2127
2128 info->name = nvc0_hw_sm_query_get_name(queries[id]->type);
2129 info->query_type = NVC0_HW_SM_QUERY(queries[id]->type);
2130 info->group_id = NVC0_HW_SM_QUERY_GROUP;
2131 return 1;
2132 }
2133 }
2134 }
2135 return 0;
2136 }