/*
 * Copyright 2011 Christoph Bumiller
 * Copyright 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
/* Compute program (raw Kepler SM ISA) launched at end_query time to copy the
 * MP performance counters of every MP into the query buffer. Partial
 * disassembly as preserved in the original source:
 *
 * sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
 * mov b32 $r12 $physid
 * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
 * set $p0 0x1 eq u32 $r8 0x0
 * mov b32 $r10 c0[0x0]
 * ext u32 $r8 $r12 0x414
 * mov b32 $r11 c0[0x4]
 * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
 * ext u32 $r9 $r12 0x208
 * set $p1 0x1 eq u32 $r9 0x0
 * mul $r8 u32 $r8 u32 96
 * mul $r12 u32 $r9 u32 16
 * mul $r13 u32 $r9 u32 4
 * add b32 $r9 $r8 $r13
 * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
 * add b32 $r8 $r8 $r12
 * add b32 $r10 $c $r10 $r8
 * add b32 $r11 $r11 0x0 $c
 * add b32 $r12 $c $r12 $r9
 * st b128 wt g[$r10d] $r0q
 * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
 * add b32 $r13 $r13 0x0 $c
 * $p1 st b128 wt g[$r12d+0x40] $r4q
 * st b32 wt g[$r12d+0x50] $r0
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * [remainder of this sentence was lost in this extract]).
 */
/* Configuration of a single hardware MP performance-counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
/* Ways of combining the values of the allocated counter slots into the single
 * 64-bit result reported for a query (see nvc0_hw_sm_get_query_result). */
#define NVC0_COUNTER_OPn_SUM 0
#define NVC0_COUNTER_OPn_OR 1
#define NVC0_COUNTER_OPn_AND 2
#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0] */
#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0] */
145 struct nvc0_hw_sm_query_cfg
147 struct nvc0_hw_sm_counter_cfg ctr
[8];
148 uint8_t num_counters
;
150 uint8_t norm
[2]; /* normalization num,denom */
/* Kepler (NVE4+) table-entry helpers:
 *   n     = NVE4_HW_SM_QUERY_* name suffix
 *   f/m   = counter FUNC mask and FUNC mode
 *   g/s   = signal group (SIGSEL) and source selection
 *   nu/dn = normalization numerator/denominator
 * _Q1A/_Q1B declare a single counter in domain A resp. B; _M2A/_M2B/_M2AB
 * declare a two-counter metric (both in A, both in B, or one in each),
 * combined according to NVC0_COUNTER_OP2_##o. */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
/*
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *  this is inaccurate !
 */
174 static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries
[] =
176 _Q1B(ACTIVE_CYCLES
, 0x0001, B6
, WARP
, 0x00000000, 1, 1),
177 _Q1B(ACTIVE_WARPS
, 0x003f, B6
, WARP
, 0x31483104, 2, 1),
178 _Q1A(ATOM_COUNT
, 0x0001, B6
, BRANCH
, 0x00000000, 1, 1),
179 _Q1A(BRANCH
, 0x0001, B6
, BRANCH
, 0x0000000c, 1, 1),
180 _Q1A(DIVERGENT_BRANCH
, 0x0001, B6
, BRANCH
, 0x00000010, 1, 1),
181 _Q1A(GLD_REQUEST
, 0x0001, B6
, LDST
, 0x00000010, 1, 1),
182 _Q1B(GLD_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000010, 1, 1),
183 _Q1B(GST_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000004, 1, 1),
184 _Q1B(GST_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000014, 1, 1),
185 _Q1A(GRED_COUNT
, 0x0001, B6
, BRANCH
, 0x00000008, 1, 1),
186 _Q1A(GST_REQUEST
, 0x0001, B6
, LDST
, 0x00000014, 1, 1),
187 _Q1A(INST_EXECUTED
, 0x0003, B6
, EXEC
, 0x00000398, 1, 1),
188 _Q1A(INST_ISSUED
, 0x0003, B6
, ISSUE
, 0x00000104, 1, 1),
189 _Q1A(INST_ISSUED1
, 0x0001, B6
, ISSUE
, 0x00000004, 1, 1),
190 _Q1A(INST_ISSUED2
, 0x0001, B6
, ISSUE
, 0x00000008, 1, 1),
191 _Q1B(L1_GLD_HIT
, 0x0001, B6
, L1
, 0x00000010, 1, 1),
192 _Q1B(L1_GLD_MISS
, 0x0001, B6
, L1
, 0x00000014, 1, 1),
193 _Q1B(L1_LOCAL_LD_HIT
, 0x0001, B6
, L1
, 0x00000000, 1, 1),
194 _Q1B(L1_LOCAL_LD_MISS
, 0x0001, B6
, L1
, 0x00000004, 1, 1),
195 _Q1B(L1_LOCAL_ST_HIT
, 0x0001, B6
, L1
, 0x00000008, 1, 1),
196 _Q1B(L1_LOCAL_ST_MISS
, 0x0001, B6
, L1
, 0x0000000c, 1, 1),
197 _Q1B(L1_SHARED_LD_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000008, 1, 1),
198 _Q1B(L1_SHARED_ST_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x0000000c, 1, 1),
199 _Q1A(LOCAL_LD
, 0x0001, B6
, LDST
, 0x00000008, 1, 1),
200 _Q1B(LOCAL_LD_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000000, 1, 1),
201 _Q1A(LOCAL_ST
, 0x0001, B6
, LDST
, 0x0000000c, 1, 1),
202 _Q1B(LOCAL_ST_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000004, 1, 1),
203 _Q1A(PROF_TRIGGER_0
, 0x0001, B6
, USER
, 0x00000000, 1, 1),
204 _Q1A(PROF_TRIGGER_1
, 0x0001, B6
, USER
, 0x00000004, 1, 1),
205 _Q1A(PROF_TRIGGER_2
, 0x0001, B6
, USER
, 0x00000008, 1, 1),
206 _Q1A(PROF_TRIGGER_3
, 0x0001, B6
, USER
, 0x0000000c, 1, 1),
207 _Q1A(PROF_TRIGGER_4
, 0x0001, B6
, USER
, 0x00000010, 1, 1),
208 _Q1A(PROF_TRIGGER_5
, 0x0001, B6
, USER
, 0x00000014, 1, 1),
209 _Q1A(PROF_TRIGGER_6
, 0x0001, B6
, USER
, 0x00000018, 1, 1),
210 _Q1A(PROF_TRIGGER_7
, 0x0001, B6
, USER
, 0x0000001c, 1, 1),
211 _Q1A(SHARED_LD
, 0x0001, B6
, LDST
, 0x00000000, 1, 1),
212 _Q1B(SHARED_LD_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000008, 1, 1),
213 _Q1A(SHARED_ST
, 0x0001, B6
, LDST
, 0x00000004, 1, 1),
214 _Q1B(SHARED_ST_REPLAY
, 0x0001, B6
, REPLAY
, 0x0000000c, 1, 1),
215 _Q1B(SM_CTA_LAUNCHED
, 0x0001, B6
, WARP
, 0x0000001c, 1, 1),
216 _Q1A(THREADS_LAUNCHED
, 0x003f, B6
, LAUNCH
, 0x398a4188, 1, 1),
217 _Q1B(UNCACHED_GLD_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000000, 1, 1),
218 _Q1A(WARPS_LAUNCHED
, 0x0001, B6
, LAUNCH
, 0x00000004, 1, 1),
219 _M2AB(IPC
, 0x3, B6
, EXEC
, 0x398, 0xffff, LOGOP
, WARP
, 0x0, DIV_SUM_M0
, 10, 1),
220 _M2AB(IPAC
, 0x3, B6
, EXEC
, 0x398, 0x1, B6
, WARP
, 0x0, AVG_DIV_MM
, 10, 1),
221 _M2A(IPEC
, 0x3, B6
, EXEC
, 0x398, 0xe, LOGOP
, EXEC
, 0x398, AVG_DIV_MM
, 10, 1),
222 _M2A(INST_REPLAY_OHEAD
, 0x3, B6
, ISSUE
, 0x104, 0x3, B6
, EXEC
, 0x398, REL_SUM_MM
, 100, 1),
223 _M2B(MP_OCCUPANCY
, 0x3f, B6
, WARP
, 0x31483104, 0x01, B6
, WARP
, 0x0, AVG_DIV_MM
, 200, 64),
224 _M2B(MP_EFFICIENCY
, 0x01, B6
, WARP
, 0x0, 0xffff, LOGOP
, WARP
, 0x0, AVG_DIV_M0
, 100, 1),
232 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* Compute program (raw Fermi SM ISA) launched at end_query time to copy the
 * MP performance counters of every MP into the query buffer. Partial
 * disassembly as preserved in the original:
 *
 * mov b32 $r9 $physid
 * set $p0 0x1 eq u32 $r8 0x0
 * mov b32 $r10 c0[0x0]
 * mov b32 $r11 c0[0x4]
 * ext u32 $r8 $r9 0x414
 * mul $r8 u32 $r8 u32 48
 * add b32 $r10 $c $r10 $r8
 * add b32 $r11 $r11 0x0 $c
 * mov b32 $r8 c0[0x8]
 * st b128 wt g[$r10d+0x00] $r0q
 * st b128 wt g[$r10d+0x10] $r4q
 * st b32 wt g[$r10d+0x20] $r8
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
/* Fermi (NVC0:NVE4) table-entry helpers:
 *   _C(f, o, g, m, s): one counter slot -- FUNC mask f, OP mode o, signal
 *     group g, source mask m, source select s (domain is always 0 on Fermi).
 *   _Q(n, c, ...): query NVC0_HW_SM_QUERY_##n using c counter slots (the
 *     _C(...) varargs), summed with no normalization.
 * NOTE(review): the _Q macro's final continuation line was truncated in this
 * extract; the closing brace below is reconstructed from the
 * nvc0_hw_sm_query_cfg initializer layout -- confirm against upstream. */
#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
#define _Q(n, c, ...) [NVC0_HW_SM_QUERY_##n] = { \
   { __VA_ARGS__ }, c, NVC0_COUNTER_OPn_SUM, { 1, 1 }, \
}
288 static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries
[] =
290 _Q(ACTIVE_CYCLES
, 1, _C(0xaaaa, LOGOP
, 0x11, 0x000000ff, 0x00000000)),
291 _Q(ACTIVE_WARPS
, 6, _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000010),
292 _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000020),
293 _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000030),
294 _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000040),
295 _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000050),
296 _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000060)),
297 _Q(ATOM_COUNT
, 1, _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000030)),
298 _Q(BRANCH
, 2, _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000000),
299 _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000010)),
300 _Q(DIVERGENT_BRANCH
, 2, _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000020),
301 _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000030)),
302 _Q(GLD_REQUEST
, 1, _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000030)),
303 _Q(GRED_COUNT
, 1, _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000040)),
304 _Q(GST_REQUEST
, 1, _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000060)),
305 _Q(INST_EXECUTED
, 3, _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000000),
306 _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000010),
307 _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000020)),
308 _Q(INST_ISSUED1_0
, 1, _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000010)),
309 _Q(INST_ISSUED1_1
, 1, _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000040)),
310 _Q(INST_ISSUED2_0
, 1, _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000020)),
311 _Q(INST_ISSUED2_1
, 1, _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000050)),
312 _Q(LOCAL_LD
, 1, _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000020)),
313 _Q(LOCAL_ST
, 1, _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000050)),
314 _Q(PROF_TRIGGER_0
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000000)),
315 _Q(PROF_TRIGGER_1
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000010)),
316 _Q(PROF_TRIGGER_2
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000020)),
317 _Q(PROF_TRIGGER_3
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000030)),
318 _Q(PROF_TRIGGER_4
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000040)),
319 _Q(PROF_TRIGGER_5
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000050)),
320 _Q(PROF_TRIGGER_6
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000060)),
321 _Q(PROF_TRIGGER_7
, 1, _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000070)),
322 _Q(SHARED_LD
, 1, _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000010)),
323 _Q(SHARED_ST
, 1, _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000040)),
324 _Q(THREADS_LAUNCHED
, 6, _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000010),
325 _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000020),
326 _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000030),
327 _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000040),
328 _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000050),
329 _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000060)),
330 _Q(TH_INST_EXECUTED_0
, 6, _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000000),
331 _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000010),
332 _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000020),
333 _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000030),
334 _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000040),
335 _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000050)),
336 _Q(TH_INST_EXECUTED_1
, 6, _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000000),
337 _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000010),
338 _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000020),
339 _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000030),
340 _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000040),
341 _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000050)),
342 _Q(TH_INST_EXECUTED_2
, 6, _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000000),
343 _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000010),
344 _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000020),
345 _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000030),
346 _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000040),
347 _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000050)),
348 _Q(TH_INST_EXECUTED_3
, 6, _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000000),
349 _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000010),
350 _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000020),
351 _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000030),
352 _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000040),
353 _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000050)),
354 _Q(WARPS_LAUNCHED
, 1, _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000000)),
360 static const struct nvc0_hw_sm_query_cfg
*
361 nvc0_hw_sm_query_get_cfg(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
363 struct nvc0_screen
*screen
= nvc0
->screen
;
364 struct nvc0_query
*q
= &hq
->base
;
366 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
367 return &nve4_hw_sm_queries
[q
->type
- PIPE_QUERY_DRIVER_SPECIFIC
];
368 return &nvc0_hw_sm_queries
[q
->type
- NVC0_HW_SM_QUERY(0)];
372 nvc0_hw_sm_destroy_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
374 struct nvc0_query
*q
= &hq
->base
;
375 q
->funcs
->destroy_query(nvc0
, q
);
/* Begin an MP performance-counter query on Kepler (NVE4+): reserve counter
 * slots in the two signal domains (A/B, 4 slots each) and program them.
 * NOTE(review): this extract is missing several original lines (the return
 * type line, the "unsigned i, c;" locals, some closing braces, early
 * returns and a few PUSH_DATA statements); gaps are flagged below --
 * confirm against upstream Mesa before relying on this text. */
nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned num_ab[2] = { 0, 0 }; /* slots wanted per domain (A=0, B=1) */

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   for (i = 0; i < cfg->num_counters; ++i)
      num_ab[cfg->ctr[i].sig_dom]++;

   /* each domain provides 4 slots */
   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      /* NOTE(review): early return (likely "return false;") missing here */

   assert(cfg->num_counters <= 4);
   /* NOTE(review): "4 * 8 * + 6" parses as 4*8*(+6); verify the intended
    * space reservation against upstream */
   PUSH_SPACE(push, 4 * 8 * + 6);

   /* first query ever: enable MP counter collection via the SW object */
   if (!screen->pm.mp_counters_enabled) {
      screen->pm.mp_counters_enabled = true;
      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
      PUSH_DATA (push, 0x1fcb);

   /* set sequence field to 0 (used to check if result is available) */
   for (i = 0; i < screen->mp_count; ++i)
      hq->data[i * 10 + 10] = 0;

   for (i = 0; i < cfg->num_counters; ++i) {
      const unsigned d = cfg->ctr[i].sig_dom;

      /* first counter in this domain: enable the domain via the SW object */
      if (!screen->pm.num_hw_sm_active[d]) {
         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
         if (screen->pm.num_hw_sm_active[!d])
            m |= 1 << (7 + (8 * d));
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         /* NOTE(review): "PUSH_DATA(push, m);" appears to be missing here */
      screen->pm.num_hw_sm_active[d]++;

      /* claim a free slot within this domain (slots d*4 .. d*4+3) */
      for (c = d * 4; c < (d * 4 + 4); ++c) {
         if (!screen->pm.mp_counter[c]) {
            /* NOTE(review): "hsq->ctr[i] = c;" and "break;" appear missing */
            screen->pm.mp_counter[c] = hsq;
      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */

      /* configure and reset the counter(s) */
      /* NOTE(review): the domain A/B if/else around the two SIGSEL writes
       * appears to be truncated in this extract */
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
      /* NOTE(review): trailing "PUSH_DATA(push, 0);", loop close and
       * "return true;" are missing from this extract */
/* Begin an MP performance-counter query on Fermi (NVC0:NVE4); Kepler+ is
 * forwarded to nve4_hw_sm_begin_query(). Fermi has a single domain of 8
 * counter slots.
 * NOTE(review): this extract is missing the return-type line, local
 * declarations (likely "unsigned i, c;"), several closing braces and
 * return statements; gaps are flagged below -- confirm against upstream. */
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return nve4_hw_sm_begin_query(nvc0, hq);

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      /* NOTE(review): early return (likely "return false;") missing here */

   assert(cfg->num_counters <= 8);
   PUSH_SPACE(push, 8 * 8 + 2);

   /* set sequence field to 0 (used to check if result is available) */
   for (i = 0; i < screen->mp_count; ++i) {
      const unsigned b = (0x30 / 4) * i; /* 0x30 bytes of results per MP */
      /* NOTE(review): the store that zeroes the per-MP sequence word is
       * missing from this extract */

   for (i = 0; i < cfg->num_counters; ++i) {
      uint32_t mask_sel = 0x00000000;

      /* first active counter: start PM collection via the SW object */
      if (!screen->pm.num_hw_sm_active[0]) {
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, 0x80000000);
      screen->pm.num_hw_sm_active[0]++;

      /* claim a free counter slot */
      for (c = 0; c < 8; ++c) {
         if (!screen->pm.mp_counter[c]) {
            /* NOTE(review): "hsq->ctr[i] = c;" and "break;" appear missing */
            screen->pm.mp_counter[c] = hsq;

      /* Oddly-enough, the signal id depends on the slot selected on Fermi but
       * not on Kepler. Fortunately, the signal ids are just offseted by the
       * (NOTE(review): rest of this comment was truncated in this extract) */
      mask_sel |= (c << 8);
      mask_sel |= (c << 16);
      mask_sel |= (c << 24);
      mask_sel &= cfg->ctr[i].src_mask;

      /* configure and reset the counter(s) */
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
      /* NOTE(review): trailing "PUSH_DATA(push, 0);", loop close and
       * "return true;" are missing from this extract */
/* End an MP counter query: launch the small compute program that copies the
 * MP counters into the query's buffer, release this query's slots, and
 * re-arm counters still owned by other active queries.
 * NOTE(review): this extract is missing the return-type line, local
 * declarations (c/i/mask/input[]), several if/else keywords and closing
 * braces; gaps are flagged below -- confirm against upstream Mesa. */
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   /* one workgroup per MP; block.y differs between Kepler and Fermi */
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };

   /* lazily build the counter-readout compute program */
   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->parm_size = 12; /* 3 dwords: buffer address lo/hi + sequence */
      /* NOTE(review): the "if (is_nve4) ... else ..." around the two pairs
       * of assignments below appears to be missing from this extract */
      prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
      prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
      prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
      prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
      screen->pm.prog = prog;

   /* disable all counting */
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         /* NOTE(review): is_nve4 if/else around the two IMMEDs seems missing */
         IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);

   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;

   /* NOTE(review): BCTX_REFN_bo argument list is truncated here (the bo
    * argument is missing from this extract) */
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,

   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (hq->bo->offset + hq->base_offset);
   input[1] = (hq->bo->offset + hq->base_offset) >> 32;
   input[2] = hq->sequence; /* compared on read-back to detect completion */
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      /* NOTE(review): the "if (!hsq) continue;" guard for empty slots and
       * the mask/i declarations appear to be missing from this extract */
      hsq = screen->pm.mp_counter[c];
      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         /* skip slots already re-armed for this query */
         if (mask & (1 << hsq->ctr[i]))
            /* NOTE(review): "continue;" likely missing here */
         mask |= 1 << hsq->ctr[i];
         /* NOTE(review): is_nve4 if/else around the two BEGINs seems missing */
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
/* Copy per-MP counter values for a Fermi query out of the mapped query
 * buffer into count[mp][counter].
 * NOTE(review): this extract is missing the return-type line, the trailing
 * "unsigned mp_count)" parameter line, the opening brace, local p/c
 * declarations and the early-exit/return statements; flagged below. */
nvc0_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x30 / 4) * p; /* 0x30 bytes of results per MP */

      for (c = 0; c < cfg->num_counters; ++c) {
         /* the per-MP sequence word tells whether results were written */
         if (hq->data[b + 8] != hq->sequence) {
            /* NOTE(review): "if (!wait) return false;" likely belongs here */
            if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
               /* NOTE(review): "return false;" likely missing here */
         count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
/* Copy per-MP counter values for a Kepler query out of the mapped query
 * buffer into count[mp][counter]. Slots 0-3 have 4 sub-values (accumulated
 * below), higher slots a single value at offset 16.
 * NOTE(review): this extract is missing the return-type line, the trailing
 * "unsigned mp_count)" parameter line, the opening brace, local p/c/d
 * declarations, the "else" before the accumulate, and the early-exit and
 * return statements; flagged below. */
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p; /* 0x60 bytes of results per MP */

      for (c = 0; c < cfg->num_counters; ++c) {
         /* 4 sub-values for slots 0-3, a single one otherwise */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            /* per-sub-value sequence words start at dword b+20 */
            if (hq->data[b + 20 + d] != hq->sequence) {
               /* NOTE(review): "if (!wait) return false;" likely here */
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  /* NOTE(review): "return false;" likely missing here */
            if (hsq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            /* NOTE(review): "else" missing before the accumulate below */
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
/* Metric calculations:
 * sum(x) ... sum of x over all MPs
 * avg(x) ... average of x over all MPs
 *
 * IPC              : sum(inst_executed) / clock
 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
 * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
 * MP_EFFICIENCY    : avg(active_cycles / clock)
 *
 * NOTE: Interpretation of IPC requires knowledge of MP count.
 */
/* Reduce the per-MP counter values into the single 64-bit query result
 * according to cfg->op, then apply the norm[0]/norm[1] scaling.
 * NOTE(review): this extract is missing the return-type line, local
 * declarations (value/v/ret/p/c), the "if (!ret) return false;" after the
 * read calls, the "else" keywords chaining the op blocks, the OR/AND
 * accumulation statements and the final "return true;"; flagged below. */
nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
                            boolean wait, union pipe_query_result *result)
   uint32_t count[32][8];
   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
   const struct nvc0_hw_sm_query_cfg *cfg;

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* gather raw per-MP values via the chip-specific reader */
   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
      /* NOTE(review): "else" missing before the Fermi path below */
      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);

   if (cfg->op == NVC0_COUNTER_OPn_SUM) {
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            value += count[p][c];
      value = (value * cfg->norm[0]) / cfg->norm[1];
   if (cfg->op == NVC0_COUNTER_OPn_OR) {
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            /* NOTE(review): "v |= count[p][c];" appears to be missing */
      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
   if (cfg->op == NVC0_COUNTER_OPn_AND) {
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            /* NOTE(review): "v &= count[p][c];" appears to be missing */
      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
   if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
      uint64_t v[2] = { 0, 0 };
      for (p = 0; p < mp_count; ++p) {
         /* NOTE(review): per-MP accumulation into v[0]/v[1] is missing */
      /* relative difference: (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
      value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
   if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
      for (p = 0; p < mp_count; ++p)
         value += count[p][0];
      /* divide by ctr1 of MP[0] (e.g. the clock) */
      value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         /* NOTE(review): a guard against count[p][1] == 0 is not visible */
         value += (count[p][0] * cfg->norm[0]) / count[p][1];
      value /= (uint64_t)mp_used * cfg->norm[1];
   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         value += count[p][0];
      if (count[0][1] && mp_used) {
         value *= cfg->norm[0];
         value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];

   *(uint64_t *)result = value;
   /* NOTE(review): "return true;" is missing from this extract */
759 static const struct nvc0_hw_query_funcs hw_sm_query_funcs
= {
760 .destroy_query
= nvc0_hw_sm_destroy_query
,
761 .begin_query
= nvc0_hw_sm_begin_query
,
762 .end_query
= nvc0_hw_sm_end_query
,
763 .get_query_result
= nvc0_hw_sm_get_query_result
,
766 struct nvc0_hw_query
*
767 nvc0_hw_sm_create_query(struct nvc0_context
*nvc0
, unsigned type
)
769 struct nvc0_screen
*screen
= nvc0
->screen
;
770 struct nvc0_hw_sm_query
*hsq
;
771 struct nvc0_hw_query
*hq
;
774 if (nvc0
->screen
->base
.device
->drm_version
< 0x01000101)
777 if ((type
< NVE4_HW_SM_QUERY(0) || type
> NVE4_HW_SM_QUERY_LAST
) &&
778 (type
< NVC0_HW_SM_QUERY(0) || type
> NVC0_HW_SM_QUERY_LAST
))
781 hsq
= CALLOC_STRUCT(nvc0_hw_sm_query
);
786 hq
->funcs
= &hw_sm_query_funcs
;
787 hq
->base
.type
= type
;
789 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
811 * [50] = WS0.sequence
812 * [54] = WS1.sequence
813 * [58] = WS2.sequence
814 * [5c] = WS3.sequence
816 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
819 * Note that padding is used to align memory access to 128 bits.
835 space
= (8 + 1 + 3) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
838 if (!nvc0_hw_query_allocate(nvc0
, &hq
->base
, space
)) {