2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
35 /* NOTE: intentionally using the same names as NV */
36 static const char *nve4_hw_sm_query_names
[] =
45 "global_ld_mem_divergence_replays",
46 "global_store_transaction",
47 "global_st_mem_divergence_replays",
55 "l1_global_load_miss",
59 "l1_local_store_miss",
60 "l1_shared_load_transactions",
61 "l1_shared_store_transactions",
63 "local_load_transactions",
65 "local_store_transactions",
77 "shared_store_replay",
80 "uncached_global_load_transaction",
82 /* metrics, i.e. functions of the MP counters */
83 "metric-ipc", /* inst_executed, clock */
84 "metric-ipac", /* inst_executed, active_cycles */
85 "metric-ipec", /* inst_executed, (bool)inst_executed */
86 "metric-achieved_occupancy", /* active_warps, active_cycles */
87 "metric-sm_efficiency", /* active_cycles, clock */
88 "metric-inst_replay_overhead" /* inst_issued, inst_executed */
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
/* Pre-assembled compute program (38 64-bit words) used on NVE4+ to read the
 * MP counters. nvc0_hw_sm_end_query() installs it as the code of the helper
 * compute program when the screen's 3D class is >= NVE4_3D_CLASS.
 *
 * The listing below is the disassembly as far as it is available here; "..."
 * marks lines of the listing that are not reproduced. The five `sched` lines
 * presumably correspond to the words at indices 0, 8, 16, 24 and 32 -- TODO
 * confirm against the assembler.
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * ...
    * mov b32 $r12 $physid
    * ...
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * ...
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * ...
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * ...
    * add b32 $r10 $c $r10 $r8
    * ...
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Hardware configuration of one MP performance counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};

/* How the per-MP counter values of a query are combined into one result.
 * OPn_* apply to any number of counters, OP2_* to exactly two (ctr0, ctr1).
 */
#define NVC0_COUNTER_OPn_SUM            0
#define NVC0_COUNTER_OPn_OR             1
#define NVC0_COUNTER_OPn_AND            2
#define NVC0_COUNTER_OP2_REL_SUM_MM     3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0     4 /* sum(ctr0) / (ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / (ctr1 of MP[0]) */

/* Description of one driver-specific MP query: the counters it occupies and
 * how their values are reduced and normalized.
 */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8];
   uint8_t num_counters; /* number of valid entries in ctr[] */
   /* One of NVC0_COUNTER_OP*. The initializers in this file set this member
    * (by position right after num_counters, and by name as `.op`), so it has
    * to exist here; the 8-bit type follows the neighbouring members.
    */
   uint8_t op;
   uint8_t norm[2]; /* normalization num,denom */
};
/* Initializer helpers for the NVE4+ query table. Each expands to a designated
 * array element `[index] = { counters, num_counters, op, norm }`.
 *
 *   n         query name suffix, pasted onto NVE4_HW_SM_QUERY_ (or
 *             NVE4_HW_SM_QUERY_METRIC_ for the _M2* variants)
 *   f, m      counter func and mode (mode pasted onto
 *             NVE4_COMPUTE_MP_PM_FUNC_MODE_)
 *   g         signal group, pasted onto NVE4_COMPUTE_MP_PM_{A,B}_SIGSEL_
 *   s         source selection
 *   o         two-counter operation, pasted onto NVC0_COUNTER_OP2_
 *   nu, dn    normalization numerator and denominator
 *
 * _Q1A/_Q1B describe a single counter in domain A (sig_dom 0) or B (sig_dom 1)
 * reduced with NVC0_COUNTER_OPn_SUM. _M2A/_M2B/_M2AB describe two counters,
 * both in A, both in B, or the first in A and the second in B. src_mask is
 * always 0 here.
 */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { \
   { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, \
   {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { \
   { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, \
   {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
/* NOTES:
 * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *  this is inaccurate !
 */
230 static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries
[] =
232 _Q1B(ACTIVE_CYCLES
, 0x0001, B6
, WARP
, 0x00000000, 1, 1),
233 _Q1B(ACTIVE_WARPS
, 0x003f, B6
, WARP
, 0x31483104, 2, 1),
234 _Q1A(ATOM_COUNT
, 0x0001, B6
, BRANCH
, 0x00000000, 1, 1),
235 _Q1A(BRANCH
, 0x0001, B6
, BRANCH
, 0x0000000c, 1, 1),
236 _Q1A(DIVERGENT_BRANCH
, 0x0001, B6
, BRANCH
, 0x00000010, 1, 1),
237 _Q1A(GLD_REQUEST
, 0x0001, B6
, LDST
, 0x00000010, 1, 1),
238 _Q1B(GLD_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000010, 1, 1),
239 _Q1B(GST_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000004, 1, 1),
240 _Q1B(GST_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000014, 1, 1),
241 _Q1A(GRED_COUNT
, 0x0001, B6
, BRANCH
, 0x00000008, 1, 1),
242 _Q1A(GST_REQUEST
, 0x0001, B6
, LDST
, 0x00000014, 1, 1),
243 _Q1A(INST_EXECUTED
, 0x0003, B6
, EXEC
, 0x00000398, 1, 1),
244 _Q1A(INST_ISSUED
, 0x0003, B6
, ISSUE
, 0x00000104, 1, 1),
245 _Q1A(INST_ISSUED1
, 0x0001, B6
, ISSUE
, 0x00000004, 1, 1),
246 _Q1A(INST_ISSUED2
, 0x0001, B6
, ISSUE
, 0x00000008, 1, 1),
247 _Q1B(L1_GLD_HIT
, 0x0001, B6
, L1
, 0x00000010, 1, 1),
248 _Q1B(L1_GLD_MISS
, 0x0001, B6
, L1
, 0x00000014, 1, 1),
249 _Q1B(L1_LOCAL_LD_HIT
, 0x0001, B6
, L1
, 0x00000000, 1, 1),
250 _Q1B(L1_LOCAL_LD_MISS
, 0x0001, B6
, L1
, 0x00000004, 1, 1),
251 _Q1B(L1_LOCAL_ST_HIT
, 0x0001, B6
, L1
, 0x00000008, 1, 1),
252 _Q1B(L1_LOCAL_ST_MISS
, 0x0001, B6
, L1
, 0x0000000c, 1, 1),
253 _Q1B(L1_SHARED_LD_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000008, 1, 1),
254 _Q1B(L1_SHARED_ST_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x0000000c, 1, 1),
255 _Q1A(LOCAL_LD
, 0x0001, B6
, LDST
, 0x00000008, 1, 1),
256 _Q1B(LOCAL_LD_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000000, 1, 1),
257 _Q1A(LOCAL_ST
, 0x0001, B6
, LDST
, 0x0000000c, 1, 1),
258 _Q1B(LOCAL_ST_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000004, 1, 1),
259 _Q1A(PROF_TRIGGER_0
, 0x0001, B6
, USER
, 0x00000000, 1, 1),
260 _Q1A(PROF_TRIGGER_1
, 0x0001, B6
, USER
, 0x00000004, 1, 1),
261 _Q1A(PROF_TRIGGER_2
, 0x0001, B6
, USER
, 0x00000008, 1, 1),
262 _Q1A(PROF_TRIGGER_3
, 0x0001, B6
, USER
, 0x0000000c, 1, 1),
263 _Q1A(PROF_TRIGGER_4
, 0x0001, B6
, USER
, 0x00000010, 1, 1),
264 _Q1A(PROF_TRIGGER_5
, 0x0001, B6
, USER
, 0x00000014, 1, 1),
265 _Q1A(PROF_TRIGGER_6
, 0x0001, B6
, USER
, 0x00000018, 1, 1),
266 _Q1A(PROF_TRIGGER_7
, 0x0001, B6
, USER
, 0x0000001c, 1, 1),
267 _Q1A(SHARED_LD
, 0x0001, B6
, LDST
, 0x00000000, 1, 1),
268 _Q1B(SHARED_LD_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000008, 1, 1),
269 _Q1A(SHARED_ST
, 0x0001, B6
, LDST
, 0x00000004, 1, 1),
270 _Q1B(SHARED_ST_REPLAY
, 0x0001, B6
, REPLAY
, 0x0000000c, 1, 1),
271 _Q1B(SM_CTA_LAUNCHED
, 0x0001, B6
, WARP
, 0x0000001c, 1, 1),
272 _Q1A(THREADS_LAUNCHED
, 0x003f, B6
, LAUNCH
, 0x398a4188, 1, 1),
273 _Q1B(UNCACHED_GLD_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000000, 1, 1),
274 _Q1A(WARPS_LAUNCHED
, 0x0001, B6
, LAUNCH
, 0x00000004, 1, 1),
275 _M2AB(IPC
, 0x3, B6
, EXEC
, 0x398, 0xffff, LOGOP
, WARP
, 0x0, DIV_SUM_M0
, 10, 1),
276 _M2AB(IPAC
, 0x3, B6
, EXEC
, 0x398, 0x1, B6
, WARP
, 0x0, AVG_DIV_MM
, 10, 1),
277 _M2A(IPEC
, 0x3, B6
, EXEC
, 0x398, 0xe, LOGOP
, EXEC
, 0x398, AVG_DIV_MM
, 10, 1),
278 _M2A(INST_REPLAY_OHEAD
, 0x3, B6
, ISSUE
, 0x104, 0x3, B6
, EXEC
, 0x398, REL_SUM_MM
, 100, 1),
279 _M2B(MP_OCCUPANCY
, 0x3f, B6
, WARP
, 0x31483104, 0x01, B6
, WARP
, 0x0, AVG_DIV_MM
, 200, 64),
280 _M2B(MP_EFFICIENCY
, 0x01, B6
, WARP
, 0x0, 0xffff, LOGOP
, WARP
, 0x0, AVG_DIV_M0
, 100, 1),
288 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* NOTES:
 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
 *   because there is a context-switch problem that we need to fix.
 *   Results might be wrong sometimes, be careful!
 */
294 static const char *nvc0_hw_sm_query_names
[] =
324 "thread_inst_executed_0",
325 "thread_inst_executed_1",
326 "thread_inst_executed_2",
327 "thread_inst_executed_3",
/* Pre-assembled compute program (23 64-bit words) used on NVC0:NVE4 to read
 * the MP counters. nvc0_hw_sm_end_query() installs it as the code of the
 * helper compute program when the screen's 3D class is below NVE4_3D_CLASS.
 *
 * The listing below is the disassembly as far as it is available here; "..."
 * marks lines of the listing that are not reproduced.
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* ...
    * mov b32 $r9 $physid
    * ...
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * ...
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * ...
    */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
/* Initializer helpers for the NVC0:NVE4 (Fermi) query configurations.
 *
 * _C(f, o, g, m, s) expands to one struct nvc0_hw_sm_counter_cfg initializer:
 *   f  func, o  mode suffix pasted onto NVC0_COMPUTE_MP_PM_OP_MODE_,
 *   g  signal group (sig_sel), m  src_mask, s  src_sel; sig_dom is always 0.
 * _Q(n, c) expands to the designated array element [NVC0_HW_SM_QUERY_<n>] = c.
 */
#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c
384 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
385 static const struct nvc0_hw_sm_query_cfg
388 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x11, 0x000000ff, 0x00000000),
390 .op
= NVC0_COUNTER_OPn_SUM
,
394 static const struct nvc0_hw_sm_query_cfg
397 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000010),
398 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000020),
399 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000030),
400 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000040),
401 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000050),
402 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000060),
404 .op
= NVC0_COUNTER_OPn_SUM
,
408 static const struct nvc0_hw_sm_query_cfg
411 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000030),
413 .op
= NVC0_COUNTER_OPn_SUM
,
417 static const struct nvc0_hw_sm_query_cfg
420 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000000),
421 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000010),
423 .op
= NVC0_COUNTER_OPn_SUM
,
427 static const struct nvc0_hw_sm_query_cfg
428 sm20_divergent_branch
=
430 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000020),
431 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000030),
433 .op
= NVC0_COUNTER_OPn_SUM
,
437 static const struct nvc0_hw_sm_query_cfg
440 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000030),
442 .op
= NVC0_COUNTER_OPn_SUM
,
446 static const struct nvc0_hw_sm_query_cfg
449 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000040),
451 .op
= NVC0_COUNTER_OPn_SUM
,
455 static const struct nvc0_hw_sm_query_cfg
458 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000060),
460 .op
= NVC0_COUNTER_OPn_SUM
,
464 static const struct nvc0_hw_sm_query_cfg
467 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001000),
468 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001010),
470 .op
= NVC0_COUNTER_OPn_SUM
,
474 static const struct nvc0_hw_sm_query_cfg
477 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007060),
478 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007070),
480 .op
= NVC0_COUNTER_OPn_SUM
,
484 static const struct nvc0_hw_sm_query_cfg
487 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000020),
489 .op
= NVC0_COUNTER_OPn_SUM
,
493 static const struct nvc0_hw_sm_query_cfg
496 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000050),
498 .op
= NVC0_COUNTER_OPn_SUM
,
502 static const struct nvc0_hw_sm_query_cfg
503 sm20_prof_trigger_0
=
505 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000000),
507 .op
= NVC0_COUNTER_OPn_SUM
,
511 static const struct nvc0_hw_sm_query_cfg
512 sm20_prof_trigger_1
=
514 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000010),
516 .op
= NVC0_COUNTER_OPn_SUM
,
520 static const struct nvc0_hw_sm_query_cfg
521 sm20_prof_trigger_2
=
523 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000020),
525 .op
= NVC0_COUNTER_OPn_SUM
,
529 static const struct nvc0_hw_sm_query_cfg
530 sm20_prof_trigger_3
=
532 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000030),
534 .op
= NVC0_COUNTER_OPn_SUM
,
538 static const struct nvc0_hw_sm_query_cfg
539 sm20_prof_trigger_4
=
541 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000040),
543 .op
= NVC0_COUNTER_OPn_SUM
,
547 static const struct nvc0_hw_sm_query_cfg
548 sm20_prof_trigger_5
=
550 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000050),
552 .op
= NVC0_COUNTER_OPn_SUM
,
556 static const struct nvc0_hw_sm_query_cfg
557 sm20_prof_trigger_6
=
559 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000060),
561 .op
= NVC0_COUNTER_OPn_SUM
,
565 static const struct nvc0_hw_sm_query_cfg
566 sm20_prof_trigger_7
=
568 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000070),
570 .op
= NVC0_COUNTER_OPn_SUM
,
574 static const struct nvc0_hw_sm_query_cfg
577 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000010),
579 .op
= NVC0_COUNTER_OPn_SUM
,
583 static const struct nvc0_hw_sm_query_cfg
586 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000040),
588 .op
= NVC0_COUNTER_OPn_SUM
,
592 static const struct nvc0_hw_sm_query_cfg
593 sm20_threads_launched
=
595 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000010),
596 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000020),
597 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000030),
598 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000040),
599 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000050),
600 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000060),
602 .op
= NVC0_COUNTER_OPn_SUM
,
606 static const struct nvc0_hw_sm_query_cfg
607 sm20_th_inst_executed_0
=
609 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000000),
610 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000010),
611 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000020),
612 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000030),
613 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000040),
614 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000050),
616 .op
= NVC0_COUNTER_OPn_SUM
,
620 static const struct nvc0_hw_sm_query_cfg
621 sm20_th_inst_executed_1
=
623 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000000),
624 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000010),
625 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000020),
626 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000030),
627 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000040),
628 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000050),
630 .op
= NVC0_COUNTER_OPn_SUM
,
634 static const struct nvc0_hw_sm_query_cfg
635 sm20_warps_launched
=
637 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000000),
639 .op
= NVC0_COUNTER_OPn_SUM
,
643 static const struct nvc0_hw_sm_query_cfg
*sm20_hw_sm_queries
[] =
645 _Q(ACTIVE_CYCLES
, &sm20_active_cycles
),
646 _Q(ACTIVE_WARPS
, &sm20_active_warps
),
647 _Q(ATOM_COUNT
, &sm20_atom_count
),
648 _Q(BRANCH
, &sm20_branch
),
649 _Q(DIVERGENT_BRANCH
, &sm20_divergent_branch
),
650 _Q(GLD_REQUEST
, &sm20_gld_request
),
651 _Q(GRED_COUNT
, &sm20_gred_count
),
652 _Q(GST_REQUEST
, &sm20_gst_request
),
653 _Q(INST_EXECUTED
, &sm20_inst_executed
),
654 _Q(INST_ISSUED
, &sm20_inst_issued
),
655 _Q(INST_ISSUED1_0
, NULL
),
656 _Q(INST_ISSUED1_1
, NULL
),
657 _Q(INST_ISSUED2_0
, NULL
),
658 _Q(INST_ISSUED2_1
, NULL
),
659 _Q(LOCAL_LD
, &sm20_local_ld
),
660 _Q(LOCAL_ST
, &sm20_local_st
),
661 _Q(PROF_TRIGGER_0
, &sm20_prof_trigger_0
),
662 _Q(PROF_TRIGGER_1
, &sm20_prof_trigger_1
),
663 _Q(PROF_TRIGGER_2
, &sm20_prof_trigger_2
),
664 _Q(PROF_TRIGGER_3
, &sm20_prof_trigger_3
),
665 _Q(PROF_TRIGGER_4
, &sm20_prof_trigger_4
),
666 _Q(PROF_TRIGGER_5
, &sm20_prof_trigger_5
),
667 _Q(PROF_TRIGGER_6
, &sm20_prof_trigger_6
),
668 _Q(PROF_TRIGGER_7
, &sm20_prof_trigger_7
),
669 _Q(SHARED_LD
, &sm20_shared_ld
),
670 _Q(SHARED_ST
, &sm20_shared_st
),
671 _Q(THREADS_LAUNCHED
, &sm20_threads_launched
),
672 _Q(TH_INST_EXECUTED_0
, &sm20_th_inst_executed_0
),
673 _Q(TH_INST_EXECUTED_1
, &sm20_th_inst_executed_1
),
674 _Q(TH_INST_EXECUTED_2
, NULL
),
675 _Q(TH_INST_EXECUTED_3
, NULL
),
676 _Q(WARPS_LAUNCHED
, &sm20_warps_launched
),
679 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
680 static const struct nvc0_hw_sm_query_cfg
683 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000000),
684 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000010),
685 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000020),
687 .op
= NVC0_COUNTER_OPn_SUM
,
691 static const struct nvc0_hw_sm_query_cfg
692 sm21_inst_issued1_0
=
694 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000010),
696 .op
= NVC0_COUNTER_OPn_SUM
,
700 static const struct nvc0_hw_sm_query_cfg
701 sm21_inst_issued1_1
=
703 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000040),
705 .op
= NVC0_COUNTER_OPn_SUM
,
709 static const struct nvc0_hw_sm_query_cfg
710 sm21_inst_issued2_0
=
712 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000020),
714 .op
= NVC0_COUNTER_OPn_SUM
,
718 static const struct nvc0_hw_sm_query_cfg
719 sm21_inst_issued2_1
=
721 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000050),
723 .op
= NVC0_COUNTER_OPn_SUM
,
727 static const struct nvc0_hw_sm_query_cfg
728 sm21_th_inst_executed_0
=
730 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000000),
731 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000010),
732 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000020),
733 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000030),
734 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000040),
735 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000050),
737 .op
= NVC0_COUNTER_OPn_SUM
,
741 static const struct nvc0_hw_sm_query_cfg
742 sm21_th_inst_executed_1
=
744 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000000),
745 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000010),
746 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000020),
747 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000030),
748 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000040),
749 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000050),
751 .op
= NVC0_COUNTER_OPn_SUM
,
755 static const struct nvc0_hw_sm_query_cfg
756 sm21_th_inst_executed_2
=
758 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000000),
759 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000010),
760 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000020),
761 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000030),
762 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000040),
763 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000050),
765 .op
= NVC0_COUNTER_OPn_SUM
,
769 static const struct nvc0_hw_sm_query_cfg
770 sm21_th_inst_executed_3
=
772 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000000),
773 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000010),
774 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000020),
775 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000030),
776 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000040),
777 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000050),
779 .op
= NVC0_COUNTER_OPn_SUM
,
783 static const struct nvc0_hw_sm_query_cfg
*sm21_hw_sm_queries
[] =
785 _Q(ACTIVE_CYCLES
, &sm20_active_cycles
),
786 _Q(ACTIVE_WARPS
, &sm20_active_warps
),
787 _Q(ATOM_COUNT
, &sm20_atom_count
),
788 _Q(BRANCH
, &sm20_branch
),
789 _Q(DIVERGENT_BRANCH
, &sm20_divergent_branch
),
790 _Q(GLD_REQUEST
, &sm20_gld_request
),
791 _Q(GRED_COUNT
, &sm20_gred_count
),
792 _Q(GST_REQUEST
, &sm20_gst_request
),
793 _Q(INST_EXECUTED
, &sm21_inst_executed
),
794 _Q(INST_ISSUED
, NULL
),
795 _Q(INST_ISSUED1_0
, &sm21_inst_issued1_0
),
796 _Q(INST_ISSUED1_1
, &sm21_inst_issued1_1
),
797 _Q(INST_ISSUED2_0
, &sm21_inst_issued2_0
),
798 _Q(INST_ISSUED2_1
, &sm21_inst_issued2_1
),
799 _Q(LOCAL_LD
, &sm20_local_ld
),
800 _Q(LOCAL_ST
, &sm20_local_st
),
801 _Q(PROF_TRIGGER_0
, &sm20_prof_trigger_0
),
802 _Q(PROF_TRIGGER_1
, &sm20_prof_trigger_1
),
803 _Q(PROF_TRIGGER_2
, &sm20_prof_trigger_2
),
804 _Q(PROF_TRIGGER_3
, &sm20_prof_trigger_3
),
805 _Q(PROF_TRIGGER_4
, &sm20_prof_trigger_4
),
806 _Q(PROF_TRIGGER_5
, &sm20_prof_trigger_5
),
807 _Q(PROF_TRIGGER_6
, &sm20_prof_trigger_6
),
808 _Q(PROF_TRIGGER_7
, &sm20_prof_trigger_7
),
809 _Q(SHARED_LD
, &sm20_shared_ld
),
810 _Q(SHARED_ST
, &sm20_shared_st
),
811 _Q(THREADS_LAUNCHED
, &sm20_threads_launched
),
812 _Q(TH_INST_EXECUTED_0
, &sm21_th_inst_executed_0
),
813 _Q(TH_INST_EXECUTED_1
, &sm21_th_inst_executed_1
),
814 _Q(TH_INST_EXECUTED_2
, &sm21_th_inst_executed_2
),
815 _Q(TH_INST_EXECUTED_3
, &sm21_th_inst_executed_3
),
816 _Q(WARPS_LAUNCHED
, &sm20_warps_launched
),
822 static inline const struct nvc0_hw_sm_query_cfg
**
823 nvc0_hw_sm_get_queries(struct nvc0_screen
*screen
)
825 struct nouveau_device
*dev
= screen
->base
.device
;
827 if (dev
->chipset
== 0xc0 || dev
->chipset
== 0xc8)
828 return sm20_hw_sm_queries
;
829 return sm21_hw_sm_queries
;
832 static const struct nvc0_hw_sm_query_cfg
*
833 nvc0_hw_sm_query_get_cfg(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
835 struct nvc0_screen
*screen
= nvc0
->screen
;
836 struct nvc0_query
*q
= &hq
->base
;
838 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
839 return &nve4_hw_sm_queries
[q
->type
- PIPE_QUERY_DRIVER_SPECIFIC
];
841 if (q
->type
>= NVC0_HW_SM_QUERY(0) && q
->type
<= NVC0_HW_SM_QUERY_LAST
) {
842 const struct nvc0_hw_sm_query_cfg
**queries
=
843 nvc0_hw_sm_get_queries(screen
);
844 return queries
[q
->type
- NVC0_HW_SM_QUERY(0)];
846 debug_printf("invalid query type: %d\n", q
->type
);
851 nvc0_hw_sm_destroy_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
853 struct nvc0_query
*q
= &hq
->base
;
854 q
->funcs
->destroy_query(nvc0
, q
);
858 nve4_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
860 struct nvc0_screen
*screen
= nvc0
->screen
;
861 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
862 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
863 const struct nvc0_hw_sm_query_cfg
*cfg
;
865 unsigned num_ab
[2] = { 0, 0 };
867 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
869 /* check if we have enough free counter slots */
870 for (i
= 0; i
< cfg
->num_counters
; ++i
)
871 num_ab
[cfg
->ctr
[i
].sig_dom
]++;
873 if (screen
->pm
.num_hw_sm_active
[0] + num_ab
[0] > 4 ||
874 screen
->pm
.num_hw_sm_active
[1] + num_ab
[1] > 4) {
875 NOUVEAU_ERR("Not enough free MP counter slots !\n");
879 assert(cfg
->num_counters
<= 4);
880 PUSH_SPACE(push
, 4 * 8 * + 6);
882 if (!screen
->pm
.mp_counters_enabled
) {
883 screen
->pm
.mp_counters_enabled
= true;
884 BEGIN_NVC0(push
, SUBC_SW(0x06ac), 1);
885 PUSH_DATA (push
, 0x1fcb);
888 /* set sequence field to 0 (used to check if result is available) */
889 for (i
= 0; i
< screen
->mp_count
; ++i
)
890 hq
->data
[i
* 10 + 10] = 0;
893 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
894 const unsigned d
= cfg
->ctr
[i
].sig_dom
;
896 if (!screen
->pm
.num_hw_sm_active
[d
]) {
897 uint32_t m
= (1 << 22) | (1 << (7 + (8 * !d
)));
898 if (screen
->pm
.num_hw_sm_active
[!d
])
899 m
|= 1 << (7 + (8 * d
));
900 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
903 screen
->pm
.num_hw_sm_active
[d
]++;
905 for (c
= d
* 4; c
< (d
* 4 + 4); ++c
) {
906 if (!screen
->pm
.mp_counter
[c
]) {
908 screen
->pm
.mp_counter
[c
] = hsq
;
912 assert(c
<= (d
* 4 + 3)); /* must succeed, already checked for space */
914 /* configure and reset the counter(s) */
916 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_A_SIGSEL(c
& 3)), 1);
918 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_B_SIGSEL(c
& 3)), 1);
919 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
920 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SRCSEL(c
)), 1);
921 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
+ 0x2108421 * (c
& 3));
922 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 1);
923 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
924 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SET(c
)), 1);
931 nvc0_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
933 struct nvc0_screen
*screen
= nvc0
->screen
;
934 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
935 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
936 const struct nvc0_hw_sm_query_cfg
*cfg
;
939 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
940 return nve4_hw_sm_begin_query(nvc0
, hq
);
942 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
944 /* check if we have enough free counter slots */
945 if (screen
->pm
.num_hw_sm_active
[0] + cfg
->num_counters
> 8) {
946 NOUVEAU_ERR("Not enough free MP counter slots !\n");
950 assert(cfg
->num_counters
<= 8);
951 PUSH_SPACE(push
, 8 * 8 + 2);
953 /* set sequence field to 0 (used to check if result is available) */
954 for (i
= 0; i
< screen
->mp_count
; ++i
) {
955 const unsigned b
= (0x30 / 4) * i
;
960 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
961 uint32_t mask_sel
= 0x00000000;
963 if (!screen
->pm
.num_hw_sm_active
[0]) {
964 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
965 PUSH_DATA (push
, 0x80000000);
967 screen
->pm
.num_hw_sm_active
[0]++;
969 for (c
= 0; c
< 8; ++c
) {
970 if (!screen
->pm
.mp_counter
[c
]) {
972 screen
->pm
.mp_counter
[c
] = hsq
;
977 /* Oddly-enough, the signal id depends on the slot selected on Fermi but
978 * not on Kepler. Fortunately, the signal ids are just offseted by the
981 mask_sel
|= (c
<< 8);
982 mask_sel
|= (c
<< 16);
983 mask_sel
|= (c
<< 24);
984 mask_sel
&= cfg
->ctr
[i
].src_mask
;
986 /* configure and reset the counter(s) */
987 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SIGSEL(c
)), 1);
988 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
989 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SRCSEL(c
)), 1);
990 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
| mask_sel
);
991 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(c
)), 1);
992 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
993 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SET(c
)), 1);
1000 nvc0_hw_sm_end_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
1002 struct nvc0_screen
*screen
= nvc0
->screen
;
1003 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
1004 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1005 const bool is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
1006 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1009 const uint block
[3] = { 32, is_nve4
? 4 : 1, 1 };
1010 const uint grid
[3] = { screen
->mp_count
, screen
->gpc_count
, 1 };
1013 if (unlikely(!screen
->pm
.prog
)) {
1014 struct nvc0_program
*prog
= CALLOC_STRUCT(nvc0_program
);
1015 prog
->type
= PIPE_SHADER_COMPUTE
;
1016 prog
->translated
= true;
1017 prog
->parm_size
= 12;
1019 prog
->code
= (uint32_t *)nve4_read_hw_sm_counters_code
;
1020 prog
->code_size
= sizeof(nve4_read_hw_sm_counters_code
);
1021 prog
->num_gprs
= 14;
1023 prog
->code
= (uint32_t *)nvc0_read_hw_sm_counters_code
;
1024 prog
->code_size
= sizeof(nvc0_read_hw_sm_counters_code
);
1025 prog
->num_gprs
= 12;
1027 screen
->pm
.prog
= prog
;
1030 /* disable all counting */
1031 PUSH_SPACE(push
, 8);
1032 for (c
= 0; c
< 8; ++c
)
1033 if (screen
->pm
.mp_counter
[c
]) {
1035 IMMED_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 0);
1037 IMMED_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(c
)), 0);
1040 /* release counters for this query */
1041 for (c
= 0; c
< 8; ++c
) {
1042 if (screen
->pm
.mp_counter
[c
] == hsq
) {
1043 uint8_t d
= is_nve4
? c
/ 4 : 0; /* only one domain for NVC0:NVE4 */
1044 screen
->pm
.num_hw_sm_active
[d
]--;
1045 screen
->pm
.mp_counter
[c
] = NULL
;
1049 BCTX_REFN_bo(nvc0
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
1052 PUSH_SPACE(push
, 1);
1053 IMMED_NVC0(push
, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE
), 0);
1055 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
1056 input
[0] = (hq
->bo
->offset
+ hq
->base_offset
);
1057 input
[1] = (hq
->bo
->offset
+ hq
->base_offset
) >> 32;
1058 input
[2] = hq
->sequence
;
1059 pipe
->launch_grid(pipe
, block
, grid
, 0, input
);
1061 nouveau_bufctx_reset(nvc0
->bufctx_cp
, NVC0_BIND_CP_QUERY
);
1063 /* re-activate other counters */
1064 PUSH_SPACE(push
, 16);
1066 for (c
= 0; c
< 8; ++c
) {
1067 const struct nvc0_hw_sm_query_cfg
*cfg
;
1070 hsq
= screen
->pm
.mp_counter
[c
];
1074 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, &hsq
->base
);
1075 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1076 if (mask
& (1 << hsq
->ctr
[i
]))
1078 mask
|= 1 << hsq
->ctr
[i
];
1080 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(hsq
->ctr
[i
])), 1);
1082 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(hsq
->ctr
[i
])), 1);
1084 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
/* Collect the raw counter words of one SM query into count[mp][counter].
 *
 * Each MP owns a record of (0x30 / 4) = 12 32-bit words in hq->data.  Word 8
 * of the record is compared against hq->sequence before the counter words
 * are read; on a mismatch nouveau_bo_wait() is called on the query buffer.
 *
 * count:    output, indexed [MP][counter]
 * nvc0:     context; supplies the client passed to nouveau_bo_wait()
 * wait:     presumably selects whether blocking on the buffer is allowed
 *           (its use is not visible here) - TODO confirm
 * hq:       hardware query providing data, sequence and bo
 * cfg:      query configuration; num_counters bounds the inner loop
 */
1090 nvc0_hw_sm_query_read_data(uint32_t count
[32][8],
1091 struct nvc0_context
*nvc0
, bool wait
,
1092 struct nvc0_hw_query
*hq
,
1093 const struct nvc0_hw_sm_query_cfg
*cfg
,
1096 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1099 for (p
= 0; p
< mp_count
; ++p
) {
/* b: index of the first word of MP p's record */
1100 const unsigned b
= (0x30 / 4) * p
;
1102 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
/* record does not carry this query's sequence number yet */
1103 if (hq
->data
[b
+ 8] != hq
->sequence
) {
1106 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
/* hardware slot hsq->ctr[c] of this MP, scaled by (1 << c) */
1109 count
[p
][c
] = hq
->data
[b
+ hsq
->ctr
[c
]] * (1 << c
);
/* Collect the raw counter words of one SM query into count[mp][counter],
 * using the record layout with (0x60 / 4) = 24 32-bit words per MP.
 *
 * For every counter the loop over d runs once when the hardware slot id has
 * bits above the low two set (hsq->ctr[c] & ~3), and four times otherwise.
 * Words 20 + d of the record are compared against hq->sequence; on a
 * mismatch nouveau_bo_wait() is called on the query buffer.
 *
 * count:    output, indexed [MP][counter]
 * nvc0:     context; supplies the client passed to nouveau_bo_wait()
 * wait:     presumably selects whether blocking on the buffer is allowed
 *           (its use is not visible here) - TODO confirm
 * hq:       hardware query providing data, sequence and bo
 * cfg:      query configuration; num_counters bounds the counter loop
 */
1116 nve4_hw_sm_query_read_data(uint32_t count
[32][8],
1117 struct nvc0_context
*nvc0
, bool wait
,
1118 struct nvc0_hw_query
*hq
,
1119 const struct nvc0_hw_sm_query_cfg
*cfg
,
1122 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1125 for (p
= 0; p
< mp_count
; ++p
) {
/* b: index of the first word of MP p's record */
1126 const unsigned b
= (0x60 / 4) * p
;
1128 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1130 for (d
= 0; d
< ((hsq
->ctr
[c
] & ~3) ? 1 : 4); ++d
) {
/* sequence word d of this record does not match the query yet */
1131 if (hq
->data
[b
+ 20 + d
] != hq
->sequence
) {
1134 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
/* slot ids with bits above the low two set: single value at word 16 + (id & 3) */
1137 if (hsq
->ctr
[c
] & ~0x3)
1138 count
[p
][c
] = hq
->data
[b
+ 16 + (hsq
->ctr
[c
] & 3)];
/* accumulated across the d iterations, one word per group of four */
1140 count
[p
][c
] += hq
->data
[b
+ d
* 4 + hsq
->ctr
[c
]];
1147 /* Metric calculations:
1148 * sum(x) ... sum of x over all MPs
1149 * avg(x) ... average of x over all MPs
1151 * IPC : sum(inst_executed) / clock
1152 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
1153 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
1154 * MP_EFFICIENCY : avg(active_cycles / clock)
1156 * NOTE: Interpretation of IPC requires knowledge of MP count.
1159 nvc0_hw_sm_get_query_result(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
,
1160 boolean wait
, union pipe_query_result
*result
)
1162 uint32_t count
[32][8];
1164 unsigned mp_count
= MIN2(nvc0
->screen
->mp_count_compute
, 32);
1166 const struct nvc0_hw_sm_query_cfg
*cfg
;
1169 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1171 if (nvc0
->screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1172 ret
= nve4_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
1174 ret
= nvc0_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
1178 if (cfg
->op
== NVC0_COUNTER_OPn_SUM
) {
1179 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1180 for (p
= 0; p
< mp_count
; ++p
)
1181 value
+= count
[p
][c
];
1182 value
= (value
* cfg
->norm
[0]) / cfg
->norm
[1];
1184 if (cfg
->op
== NVC0_COUNTER_OPn_OR
) {
1186 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1187 for (p
= 0; p
< mp_count
; ++p
)
1189 value
= ((uint64_t)v
* cfg
->norm
[0]) / cfg
->norm
[1];
1191 if (cfg
->op
== NVC0_COUNTER_OPn_AND
) {
1193 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1194 for (p
= 0; p
< mp_count
; ++p
)
1196 value
= ((uint64_t)v
* cfg
->norm
[0]) / cfg
->norm
[1];
1198 if (cfg
->op
== NVC0_COUNTER_OP2_REL_SUM_MM
) {
1199 uint64_t v
[2] = { 0, 0 };
1200 for (p
= 0; p
< mp_count
; ++p
) {
1201 v
[0] += count
[p
][0];
1202 v
[1] += count
[p
][1];
1205 value
= ((v
[0] - v
[1]) * cfg
->norm
[0]) / (v
[0] * cfg
->norm
[1]);
1207 if (cfg
->op
== NVC0_COUNTER_OP2_DIV_SUM_M0
) {
1208 for (p
= 0; p
< mp_count
; ++p
)
1209 value
+= count
[p
][0];
1211 value
= (value
* cfg
->norm
[0]) / (count
[0][1] * cfg
->norm
[1]);
1215 if (cfg
->op
== NVC0_COUNTER_OP2_AVG_DIV_MM
) {
1216 unsigned mp_used
= 0;
1217 for (p
= 0; p
< mp_count
; ++p
, mp_used
+= !!count
[p
][0])
1219 value
+= (count
[p
][0] * cfg
->norm
[0]) / count
[p
][1];
1221 value
/= (uint64_t)mp_used
* cfg
->norm
[1];
1223 if (cfg
->op
== NVC0_COUNTER_OP2_AVG_DIV_M0
) {
1224 unsigned mp_used
= 0;
1225 for (p
= 0; p
< mp_count
; ++p
, mp_used
+= !!count
[p
][0])
1226 value
+= count
[p
][0];
1227 if (count
[0][1] && mp_used
) {
1228 value
*= cfg
->norm
[0];
1229 value
/= (uint64_t)count
[0][1] * mp_used
* cfg
->norm
[1];
1235 *(uint64_t *)result
= value
;
/* Callback table for hardware SM queries; nvc0_hw_sm_create_query() stores
 * its address in hq->funcs.
 */
1239 static const struct nvc0_hw_query_funcs hw_sm_query_funcs
= {
1240 .destroy_query
= nvc0_hw_sm_destroy_query
,
1241 .begin_query
= nvc0_hw_sm_begin_query
,
1242 .end_query
= nvc0_hw_sm_end_query
,
1243 .get_query_result
= nvc0_hw_sm_get_query_result
,
/* Create a hardware SM (MP performance counter) query of the given type.
 *
 * The query object is obtained with CALLOC_STRUCT, gets hw_sm_query_funcs
 * as its callback table, and has a result buffer allocated through
 * nvc0_hw_query_allocate().  The buffer size is a per-MP record size times
 * screen->mp_count: 24 32-bit words (4 * 4 + 4 + 4) in the
 * class_3d >= NVE4_3D_CLASS case, and 12 words (8 + 1 + 3) in the other
 * sizing shown below.
 *
 * NOTE(review): the buffer is sized with screen->mp_count, while
 * nvc0_hw_sm_get_query_result() iterates over mp_count_compute - confirm
 * that the two always agree.
 */
1246 struct nvc0_hw_query
*
1247 nvc0_hw_sm_create_query(struct nvc0_context
*nvc0
, unsigned type
)
1249 struct nvc0_screen
*screen
= nvc0
->screen
;
1250 struct nvc0_hw_sm_query
*hsq
;
1251 struct nvc0_hw_query
*hq
;
/* gate on the DRM interface version reported by the device */
1254 if (nvc0
->screen
->base
.device
->drm_version
< 0x01000101)
/* type lies in neither the NVE4 nor the NVC0 SM query range */
1257 if ((type
< NVE4_HW_SM_QUERY(0) || type
> NVE4_HW_SM_QUERY_LAST
) &&
1258 (type
< NVC0_HW_SM_QUERY(0) || type
> NVC0_HW_SM_QUERY_LAST
))
1261 hsq
= CALLOC_STRUCT(nvc0_hw_sm_query
);
1266 hq
->funcs
= &hw_sm_query_funcs
;
1267 hq
->base
.type
= type
;
1269 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1286 * [50] = WS0.sequence
1287 * [54] = WS1.sequence
1288 * [58] = WS2.sequence
1289 * [5c] = WS3.sequence
1291 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1294 * Note that padding is used to align memory access to 128 bits.
1305 * [20] = MP.sequence
1310 space
= (8 + 1 + 3) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1313 if (!nvc0_hw_query_allocate(nvc0
, &hq
->base
, space
)) {
/* Walk the queries table (NVC0_HW_SM_QUERY_COUNT entries) starting with
 * next = 0 and test queries[id + next] once i has reached id.
 *
 * nvc0_hw_sm_get_driver_query_info() uses the returned value to index
 * nvc0_hw_sm_query_names and to build the NVC0_HW_SM_QUERY() type.
 *
 * NOTE(review): presumably this maps a dense query index to its position in
 * a table that contains NULL holes - confirm against the full loop body.
 */
1322 nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg
**queries
,
1325 unsigned i
, next
= 0;
1327 for (i
= 0; i
< NVC0_HW_SM_QUERY_COUNT
; i
++) {
1331 if (i
>= id
&& queries
[id
+ next
]) {
1339 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen
*screen
, unsigned id
,
1340 struct pipe_driver_query_info
*info
)
1344 if (screen
->base
.device
->drm_version
>= 0x01000101) {
1345 if (screen
->compute
) {
1346 if (screen
->base
.class_3d
== NVE4_3D_CLASS
) {
1347 count
+= NVE4_HW_SM_QUERY_COUNT
;
1349 if (screen
->base
.class_3d
< NVE4_3D_CLASS
) {
1350 const struct nvc0_hw_sm_query_cfg
**queries
=
1351 nvc0_hw_sm_get_queries(screen
);
1354 for (i
= 0; i
< NVC0_HW_SM_QUERY_COUNT
; i
++) {
1366 if (screen
->compute
) {
1367 if (screen
->base
.class_3d
== NVE4_3D_CLASS
) {
1368 info
->name
= nve4_hw_sm_query_names
[id
];
1369 info
->query_type
= NVE4_HW_SM_QUERY(id
);
1370 info
->max_value
.u64
=
1371 (id
< NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY
) ? 0 : 100;
1372 info
->group_id
= NVC0_HW_SM_QUERY_GROUP
;
1375 if (screen
->base
.class_3d
< NVE4_3D_CLASS
) {
1376 const struct nvc0_hw_sm_query_cfg
**queries
=
1377 nvc0_hw_sm_get_queries(screen
);
1379 id
= nvc0_hw_sm_get_next_query_id(queries
, id
);
1380 info
->name
= nvc0_hw_sm_query_names
[id
];
1381 info
->query_type
= NVC0_HW_SM_QUERY(id
);
1382 info
->group_id
= NVC0_HW_SM_QUERY_GROUP
;