nvc0: add MP counters variants for GF100/GF110
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
34
35 /* NOTE: intentionally using the same names as NV */
36 static const char *nve4_hw_sm_query_names[] =
37 {
38 /* MP counters */
39 "active_cycles",
40 "active_warps",
41 "atom_count",
42 "branch",
43 "divergent_branch",
44 "gld_request",
45 "global_ld_mem_divergence_replays",
46 "global_store_transaction",
47 "global_st_mem_divergence_replays",
48 "gred_count",
49 "gst_request",
50 "inst_executed",
51 "inst_issued",
52 "inst_issued1",
53 "inst_issued2",
54 "l1_global_load_hit",
55 "l1_global_load_miss",
56 "l1_local_load_hit",
57 "l1_local_load_miss",
58 "l1_local_store_hit",
59 "l1_local_store_miss",
60 "l1_shared_load_transactions",
61 "l1_shared_store_transactions",
62 "local_load",
63 "local_load_transactions",
64 "local_store",
65 "local_store_transactions",
66 "prof_trigger_00",
67 "prof_trigger_01",
68 "prof_trigger_02",
69 "prof_trigger_03",
70 "prof_trigger_04",
71 "prof_trigger_05",
72 "prof_trigger_06",
73 "prof_trigger_07",
74 "shared_load",
75 "shared_load_replay",
76 "shared_store",
77 "shared_store_replay",
78 "sm_cta_launched",
79 "threads_launched",
80 "uncached_global_load_transaction",
81 "warps_launched",
82 /* metrics, i.e. functions of the MP counters */
83 "metric-ipc", /* inst_executed, clock */
84 "metric-ipac", /* inst_executed, active_cycles */
85 "metric-ipec", /* inst_executed, (bool)inst_executed */
86 "metric-achieved_occupancy", /* active_warps, active_cycles */
87 "metric-sm_efficiency", /* active_cycles, clock */
88 "metric-inst_replay_overhead" /* inst_issued, inst_executed */
89 };
90
91 /* Code to read out MP counters: They are accessible via mmio, too, but let's
92 * just avoid mapping registers in userspace. We'd have to know which MPs are
93 * enabled/present, too, and that information is not presently exposed.
94 * We could add a kernel interface for it, but reading the counters like this
95 * has the advantage of being async (if get_result isn't called immediately).
96 */
97 static const uint64_t nve4_read_hw_sm_counters_code[] =
98 {
99 /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
100 * mov b32 $r8 $tidx
101 * mov b32 $r12 $physid
102 * mov b32 $r0 $pm0
103 * mov b32 $r1 $pm1
104 * mov b32 $r2 $pm2
105 * mov b32 $r3 $pm3
106 * mov b32 $r4 $pm4
107 * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
108 * mov b32 $r5 $pm5
109 * mov b32 $r6 $pm6
110 * mov b32 $r7 $pm7
111 * set $p0 0x1 eq u32 $r8 0x0
112 * mov b32 $r10 c0[0x0]
113 * ext u32 $r8 $r12 0x414
114 * mov b32 $r11 c0[0x4]
115 * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
116 * ext u32 $r9 $r12 0x208
117 * (not $p0) exit
118 * set $p1 0x1 eq u32 $r9 0x0
119 * mul $r8 u32 $r8 u32 96
120 * mul $r12 u32 $r9 u32 16
121 * mul $r13 u32 $r9 u32 4
122 * add b32 $r9 $r8 $r13
123 * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
124 * add b32 $r8 $r8 $r12
125 * mov b32 $r12 $r10
126 * add b32 $r10 $c $r10 $r8
127 * mov b32 $r13 $r11
128 * add b32 $r11 $r11 0x0 $c
129 * add b32 $r12 $c $r12 $r9
130 * st b128 wt g[$r10d] $r0q
131 * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
132 * mov b32 $r0 c0[0x8]
133 * add b32 $r13 $r13 0x0 $c
134 * $p1 st b128 wt g[$r12d+0x40] $r4q
135 * st b32 wt g[$r12d+0x50] $r0
136 * exit */
137 0x2202020202020207ULL,
138 0x2c00000084021c04ULL,
139 0x2c0000000c031c04ULL,
140 0x2c00000010001c04ULL,
141 0x2c00000014005c04ULL,
142 0x2c00000018009c04ULL,
143 0x2c0000001c00dc04ULL,
144 0x2c00000020011c04ULL,
145 0x22b0420042320207ULL,
146 0x2c00000024015c04ULL,
147 0x2c00000028019c04ULL,
148 0x2c0000002c01dc04ULL,
149 0x190e0000fc81dc03ULL,
150 0x2800400000029de4ULL,
151 0x7000c01050c21c03ULL,
152 0x280040001002dde4ULL,
153 0x204282020042e047ULL,
154 0x7000c00820c25c03ULL,
155 0x80000000000021e7ULL,
156 0x190e0000fc93dc03ULL,
157 0x1000000180821c02ULL,
158 0x1000000040931c02ULL,
159 0x1000000010935c02ULL,
160 0x4800000034825c03ULL,
161 0x22c042c042c04287ULL,
162 0x4800000030821c03ULL,
163 0x2800000028031de4ULL,
164 0x4801000020a29c03ULL,
165 0x280000002c035de4ULL,
166 0x0800000000b2dc42ULL,
167 0x4801000024c31c03ULL,
168 0x9400000000a01fc5ULL,
169 0x200002e04202c047ULL,
170 0x2800400020001de4ULL,
171 0x0800000000d35c42ULL,
172 0x9400000100c107c5ULL,
173 0x9400000140c01f85ULL,
174 0x8000000000001de7ULL
175 };
176
177 /* For simplicity, we will allocate as many group slots as we allocate counter
178 * slots. This means that a single counter which wants to source from 2 groups
179 * will have to be declared as using 2 counter slots. This shouldn't really be
180 * a problem because such queries don't make much sense ... (unless someone is
181 * really creative).
182 */
183 struct nvc0_hw_sm_counter_cfg
184 {
185 uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */
186 uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */
187 uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
188 uint32_t sig_sel : 8; /* signal group */
189 uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */
190 uint32_t src_sel; /* signal selection for up to 4 sources */
191 };
192
193 #define NVC0_COUNTER_OPn_SUM 0
194 #define NVC0_COUNTER_OPn_OR 1
195 #define NVC0_COUNTER_OPn_AND 2
196 #define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
197 #define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */
198 #define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
199 #define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */
200
201 struct nvc0_hw_sm_query_cfg
202 {
203 struct nvc0_hw_sm_counter_cfg ctr[8];
204 uint8_t num_counters;
205 uint8_t op;
206 uint8_t norm[2]; /* normalization num,denom */
207 };
208
209 #define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
210 #define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
211 #define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
212 { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
213 { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
214 {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
215 #define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
216 { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
217 { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
218 {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
219 #define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
220 { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
221 { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
222 {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
223
224 /* NOTES:
225 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
226 * inst_executed etc.: we only count a single warp scheduler
227 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
228 * this is inaccurate !
229 */
230 static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
231 {
232 _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
233 _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
234 _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
235 _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
236 _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
237 _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
238 _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
239 _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
240 _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
241 _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
242 _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
243 _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
244 _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
245 _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
246 _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
247 _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
248 _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
249 _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
250 _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
251 _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
252 _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
253 _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
254 _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
255 _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
256 _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
257 _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
258 _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
259 _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
260 _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
261 _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
262 _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
263 _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
264 _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
265 _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
266 _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
267 _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
268 _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
269 _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
270 _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
271 _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
272 _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
273 _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
274 _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
275 _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
276 _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
277 _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
278 _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
279 _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
280 _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
281 };
282
283 #undef _Q1A
284 #undef _Q1B
285 #undef _M2A
286 #undef _M2B
287
288 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
289 static const char *nvc0_hw_sm_query_names[] =
290 {
291 /* MP counters */
292 "active_cycles",
293 "active_warps",
294 "atom_count",
295 "branch",
296 "divergent_branch",
297 "gld_request",
298 "gred_count",
299 "gst_request",
300 "inst_executed",
301 "inst_issued",
302 "inst_issued1_0",
303 "inst_issued1_1",
304 "inst_issued2_0",
305 "inst_issued2_1",
306 "local_load",
307 "local_store",
308 "prof_trigger_00",
309 "prof_trigger_01",
310 "prof_trigger_02",
311 "prof_trigger_03",
312 "prof_trigger_04",
313 "prof_trigger_05",
314 "prof_trigger_06",
315 "prof_trigger_07",
316 "shared_load",
317 "shared_store",
318 "threads_launched",
319 "thread_inst_executed_0",
320 "thread_inst_executed_1",
321 "thread_inst_executed_2",
322 "thread_inst_executed_3",
323 "warps_launched",
324 };
325
326 static const uint64_t nvc0_read_hw_sm_counters_code[] =
327 {
328 /* mov b32 $r8 $tidx
329 * mov b32 $r9 $physid
330 * mov b32 $r0 $pm0
331 * mov b32 $r1 $pm1
332 * mov b32 $r2 $pm2
333 * mov b32 $r3 $pm3
334 * mov b32 $r4 $pm4
335 * mov b32 $r5 $pm5
336 * mov b32 $r6 $pm6
337 * mov b32 $r7 $pm7
338 * set $p0 0x1 eq u32 $r8 0x0
339 * mov b32 $r10 c0[0x0]
340 * mov b32 $r11 c0[0x4]
341 * ext u32 $r8 $r9 0x414
342 * (not $p0) exit
343 * mul $r8 u32 $r8 u32 48
344 * add b32 $r10 $c $r10 $r8
345 * add b32 $r11 $r11 0x0 $c
346 * mov b32 $r8 c0[0x8]
347 * st b128 wt g[$r10d+0x00] $r0q
348 * st b128 wt g[$r10d+0x10] $r4q
349 * st b32 wt g[$r10d+0x20] $r8
350 * exit */
351 0x2c00000084021c04ULL,
352 0x2c0000000c025c04ULL,
353 0x2c00000010001c04ULL,
354 0x2c00000014005c04ULL,
355 0x2c00000018009c04ULL,
356 0x2c0000001c00dc04ULL,
357 0x2c00000020011c04ULL,
358 0x2c00000024015c04ULL,
359 0x2c00000028019c04ULL,
360 0x2c0000002c01dc04ULL,
361 0x190e0000fc81dc03ULL,
362 0x2800400000029de4ULL,
363 0x280040001002dde4ULL,
364 0x7000c01050921c03ULL,
365 0x80000000000021e7ULL,
366 0x10000000c0821c02ULL,
367 0x4801000020a29c03ULL,
368 0x0800000000b2dc42ULL,
369 0x2800400020021de4ULL,
370 0x9400000000a01fc5ULL,
371 0x9400000040a11fc5ULL,
372 0x9400000080a21f85ULL,
373 0x8000000000001de7ULL
374 };
375
376 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
377 #define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c
378
379 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
380 static const struct nvc0_hw_sm_query_cfg
381 sm20_active_cycles =
382 {
383 .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
384 .num_counters = 1,
385 .op = NVC0_COUNTER_OPn_SUM,
386 .norm = { 1, 1 },
387 };
388
389 static const struct nvc0_hw_sm_query_cfg
390 sm20_active_warps =
391 {
392 .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
393 .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
394 .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
395 .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
396 .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
397 .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
398 .num_counters = 6,
399 .op = NVC0_COUNTER_OPn_SUM,
400 .norm = { 1, 1 },
401 };
402
403 static const struct nvc0_hw_sm_query_cfg
404 sm20_atom_count =
405 {
406 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
407 .num_counters = 1,
408 .op = NVC0_COUNTER_OPn_SUM,
409 .norm = { 1, 1 },
410 };
411
412 static const struct nvc0_hw_sm_query_cfg
413 sm20_branch =
414 {
415 .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
416 .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
417 .num_counters = 2,
418 .op = NVC0_COUNTER_OPn_SUM,
419 .norm = { 1, 1 },
420 };
421
422 static const struct nvc0_hw_sm_query_cfg
423 sm20_divergent_branch =
424 {
425 .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
426 .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
427 .num_counters = 2,
428 .op = NVC0_COUNTER_OPn_SUM,
429 .norm = { 1, 1 },
430 };
431
432 static const struct nvc0_hw_sm_query_cfg
433 sm20_gld_request =
434 {
435 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
436 .num_counters = 1,
437 .op = NVC0_COUNTER_OPn_SUM,
438 .norm = { 1, 1 },
439 };
440
441 static const struct nvc0_hw_sm_query_cfg
442 sm20_gred_count =
443 {
444 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
445 .num_counters = 1,
446 .op = NVC0_COUNTER_OPn_SUM,
447 .norm = { 1, 1 },
448 };
449
450 static const struct nvc0_hw_sm_query_cfg
451 sm20_gst_request =
452 {
453 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
454 .num_counters = 1,
455 .op = NVC0_COUNTER_OPn_SUM,
456 .norm = { 1, 1 },
457 };
458
459 static const struct nvc0_hw_sm_query_cfg
460 sm20_inst_executed =
461 {
462 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
463 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
464 .num_counters = 2,
465 .op = NVC0_COUNTER_OPn_SUM,
466 .norm = { 1, 1 },
467 };
468
469 static const struct nvc0_hw_sm_query_cfg
470 sm20_inst_issued =
471 {
472 .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
473 .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
474 .num_counters = 2,
475 .op = NVC0_COUNTER_OPn_SUM,
476 .norm = { 1, 1 },
477 };
478
479 static const struct nvc0_hw_sm_query_cfg
480 sm20_local_ld =
481 {
482 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
483 .num_counters = 1,
484 .op = NVC0_COUNTER_OPn_SUM,
485 .norm = { 1, 1 },
486 };
487
488 static const struct nvc0_hw_sm_query_cfg
489 sm20_local_st =
490 {
491 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
492 .num_counters = 1,
493 .op = NVC0_COUNTER_OPn_SUM,
494 .norm = { 1, 1 },
495 };
496
497 static const struct nvc0_hw_sm_query_cfg
498 sm20_prof_trigger_0 =
499 {
500 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
501 .num_counters = 1,
502 .op = NVC0_COUNTER_OPn_SUM,
503 .norm = { 1, 1 },
504 };
505
506 static const struct nvc0_hw_sm_query_cfg
507 sm20_prof_trigger_1 =
508 {
509 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
510 .num_counters = 1,
511 .op = NVC0_COUNTER_OPn_SUM,
512 .norm = { 1, 1 },
513 };
514
515 static const struct nvc0_hw_sm_query_cfg
516 sm20_prof_trigger_2 =
517 {
518 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
519 .num_counters = 1,
520 .op = NVC0_COUNTER_OPn_SUM,
521 .norm = { 1, 1 },
522 };
523
524 static const struct nvc0_hw_sm_query_cfg
525 sm20_prof_trigger_3 =
526 {
527 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
528 .num_counters = 1,
529 .op = NVC0_COUNTER_OPn_SUM,
530 .norm = { 1, 1 },
531 };
532
533 static const struct nvc0_hw_sm_query_cfg
534 sm20_prof_trigger_4 =
535 {
536 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
537 .num_counters = 1,
538 .op = NVC0_COUNTER_OPn_SUM,
539 .norm = { 1, 1 },
540 };
541
542 static const struct nvc0_hw_sm_query_cfg
543 sm20_prof_trigger_5 =
544 {
545 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
546 .num_counters = 1,
547 .op = NVC0_COUNTER_OPn_SUM,
548 .norm = { 1, 1 },
549 };
550
551 static const struct nvc0_hw_sm_query_cfg
552 sm20_prof_trigger_6 =
553 {
554 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
555 .num_counters = 1,
556 .op = NVC0_COUNTER_OPn_SUM,
557 .norm = { 1, 1 },
558 };
559
560 static const struct nvc0_hw_sm_query_cfg
561 sm20_prof_trigger_7 =
562 {
563 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
564 .num_counters = 1,
565 .op = NVC0_COUNTER_OPn_SUM,
566 .norm = { 1, 1 },
567 };
568
569 static const struct nvc0_hw_sm_query_cfg
570 sm20_shared_ld =
571 {
572 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
573 .num_counters = 1,
574 .op = NVC0_COUNTER_OPn_SUM,
575 .norm = { 1, 1 },
576 };
577
578 static const struct nvc0_hw_sm_query_cfg
579 sm20_shared_st =
580 {
581 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
582 .num_counters = 1,
583 .op = NVC0_COUNTER_OPn_SUM,
584 .norm = { 1, 1 },
585 };
586
587 static const struct nvc0_hw_sm_query_cfg
588 sm20_threads_launched =
589 {
590 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
591 .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
592 .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
593 .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
594 .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
595 .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
596 .num_counters = 6,
597 .op = NVC0_COUNTER_OPn_SUM,
598 .norm = { 1, 1 },
599 };
600
601 static const struct nvc0_hw_sm_query_cfg
602 sm20_th_inst_executed_0 =
603 {
604 .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
605 .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
606 .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
607 .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
608 .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
609 .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
610 .num_counters = 6,
611 .op = NVC0_COUNTER_OPn_SUM,
612 .norm = { 1, 1 },
613 };
614
615 static const struct nvc0_hw_sm_query_cfg
616 sm20_th_inst_executed_1 =
617 {
618 .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
619 .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
620 .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
621 .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
622 .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
623 .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
624 .num_counters = 6,
625 .op = NVC0_COUNTER_OPn_SUM,
626 .norm = { 1, 1 },
627 };
628
629 static const struct nvc0_hw_sm_query_cfg
630 sm20_warps_launched =
631 {
632 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
633 .num_counters = 1,
634 .op = NVC0_COUNTER_OPn_SUM,
635 .norm = { 1, 1 },
636 };
637
638 static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
639 {
640 _Q(ACTIVE_CYCLES, &sm20_active_cycles),
641 _Q(ACTIVE_WARPS, &sm20_active_warps),
642 _Q(ATOM_COUNT, &sm20_atom_count),
643 _Q(BRANCH, &sm20_branch),
644 _Q(DIVERGENT_BRANCH, &sm20_divergent_branch),
645 _Q(GLD_REQUEST, &sm20_gld_request),
646 _Q(GRED_COUNT, &sm20_gred_count),
647 _Q(GST_REQUEST, &sm20_gst_request),
648 _Q(INST_EXECUTED, &sm20_inst_executed),
649 _Q(INST_ISSUED, &sm20_inst_issued),
650 _Q(INST_ISSUED1_0, NULL),
651 _Q(INST_ISSUED1_1, NULL),
652 _Q(INST_ISSUED2_0, NULL),
653 _Q(INST_ISSUED2_1, NULL),
654 _Q(LOCAL_LD, &sm20_local_ld),
655 _Q(LOCAL_ST, &sm20_local_st),
656 _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0),
657 _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1),
658 _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2),
659 _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3),
660 _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4),
661 _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5),
662 _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6),
663 _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7),
664 _Q(SHARED_LD, &sm20_shared_ld),
665 _Q(SHARED_ST, &sm20_shared_st),
666 _Q(THREADS_LAUNCHED, &sm20_threads_launched),
667 _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0),
668 _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1),
669 _Q(TH_INST_EXECUTED_2, NULL),
670 _Q(TH_INST_EXECUTED_3, NULL),
671 _Q(WARPS_LAUNCHED, &sm20_warps_launched),
672 };
673
674 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
675 static const struct nvc0_hw_sm_query_cfg
676 sm21_inst_executed =
677 {
678 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
679 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
680 .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
681 .num_counters = 3,
682 .op = NVC0_COUNTER_OPn_SUM,
683 .norm = { 1, 1 },
684 };
685
686 static const struct nvc0_hw_sm_query_cfg
687 sm21_inst_issued1_0 =
688 {
689 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
690 .num_counters = 1,
691 .op = NVC0_COUNTER_OPn_SUM,
692 .norm = { 1, 1 },
693 };
694
695 static const struct nvc0_hw_sm_query_cfg
696 sm21_inst_issued1_1 =
697 {
698 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
699 .num_counters = 1,
700 .op = NVC0_COUNTER_OPn_SUM,
701 .norm = { 1, 1 },
702 };
703
704 static const struct nvc0_hw_sm_query_cfg
705 sm21_inst_issued2_0 =
706 {
707 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
708 .num_counters = 1,
709 .op = NVC0_COUNTER_OPn_SUM,
710 .norm = { 1, 1 },
711 };
712
713 static const struct nvc0_hw_sm_query_cfg
714 sm21_inst_issued2_1 =
715 {
716 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
717 .num_counters = 1,
718 .op = NVC0_COUNTER_OPn_SUM,
719 .norm = { 1, 1 },
720 };
721
722 static const struct nvc0_hw_sm_query_cfg
723 sm21_th_inst_executed_0 =
724 {
725 .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
726 .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
727 .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
728 .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
729 .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
730 .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
731 .num_counters = 6,
732 .op = NVC0_COUNTER_OPn_SUM,
733 .norm = { 1, 1 },
734 };
735
736 static const struct nvc0_hw_sm_query_cfg
737 sm21_th_inst_executed_1 =
738 {
739 .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
740 .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
741 .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
742 .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
743 .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
744 .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
745 .num_counters = 6,
746 .op = NVC0_COUNTER_OPn_SUM,
747 .norm = { 1, 1 },
748 };
749
750 static const struct nvc0_hw_sm_query_cfg
751 sm21_th_inst_executed_2 =
752 {
753 .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
754 .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
755 .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
756 .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
757 .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
758 .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
759 .num_counters = 6,
760 .op = NVC0_COUNTER_OPn_SUM,
761 .norm = { 1, 1 },
762 };
763
764 static const struct nvc0_hw_sm_query_cfg
765 sm21_th_inst_executed_3 =
766 {
767 .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
768 .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
769 .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
770 .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
771 .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
772 .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
773 .num_counters = 6,
774 .op = NVC0_COUNTER_OPn_SUM,
775 .norm = { 1, 1 },
776 };
777
778 static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
779 {
780 _Q(ACTIVE_CYCLES, &sm20_active_cycles),
781 _Q(ACTIVE_WARPS, &sm20_active_warps),
782 _Q(ATOM_COUNT, &sm20_atom_count),
783 _Q(BRANCH, &sm20_branch),
784 _Q(DIVERGENT_BRANCH, &sm20_divergent_branch),
785 _Q(GLD_REQUEST, &sm20_gld_request),
786 _Q(GRED_COUNT, &sm20_gred_count),
787 _Q(GST_REQUEST, &sm20_gst_request),
788 _Q(INST_EXECUTED, &sm21_inst_executed),
789 _Q(INST_ISSUED, NULL),
790 _Q(INST_ISSUED1_0, &sm21_inst_issued1_0),
791 _Q(INST_ISSUED1_1, &sm21_inst_issued1_1),
792 _Q(INST_ISSUED2_0, &sm21_inst_issued2_0),
793 _Q(INST_ISSUED2_1, &sm21_inst_issued2_1),
794 _Q(LOCAL_LD, &sm20_local_ld),
795 _Q(LOCAL_ST, &sm20_local_st),
796 _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0),
797 _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1),
798 _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2),
799 _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3),
800 _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4),
801 _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5),
802 _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6),
803 _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7),
804 _Q(SHARED_LD, &sm20_shared_ld),
805 _Q(SHARED_ST, &sm20_shared_st),
806 _Q(THREADS_LAUNCHED, &sm20_threads_launched),
807 _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0),
808 _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1),
809 _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2),
810 _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3),
811 _Q(WARPS_LAUNCHED, &sm20_warps_launched),
812 };
813
814 #undef _Q
815 #undef _C
816
817 static inline const struct nvc0_hw_sm_query_cfg **
818 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
819 {
820 struct nouveau_device *dev = screen->base.device;
821
822 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
823 return sm20_hw_sm_queries;
824 return sm21_hw_sm_queries;
825 }
826
827 static const struct nvc0_hw_sm_query_cfg *
828 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
829 {
830 struct nvc0_screen *screen = nvc0->screen;
831 struct nvc0_query *q = &hq->base;
832
833 if (screen->base.class_3d >= NVE4_3D_CLASS)
834 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
835
836 if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) {
837 const struct nvc0_hw_sm_query_cfg **queries =
838 nvc0_hw_sm_get_queries(screen);
839 return queries[q->type - NVC0_HW_SM_QUERY(0)];
840 }
841 debug_printf("invalid query type: %d\n", q->type);
842 return NULL;
843 }
844
845 static void
846 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
847 {
848 struct nvc0_query *q = &hq->base;
849 q->funcs->destroy_query(nvc0, q);
850 }
851
852 static boolean
853 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
854 {
855 struct nvc0_screen *screen = nvc0->screen;
856 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
857 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
858 const struct nvc0_hw_sm_query_cfg *cfg;
859 unsigned i, c;
860 unsigned num_ab[2] = { 0, 0 };
861
862 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
863
864 /* check if we have enough free counter slots */
865 for (i = 0; i < cfg->num_counters; ++i)
866 num_ab[cfg->ctr[i].sig_dom]++;
867
868 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
869 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
870 NOUVEAU_ERR("Not enough free MP counter slots !\n");
871 return false;
872 }
873
874 assert(cfg->num_counters <= 4);
875 PUSH_SPACE(push, 4 * 8 * + 6);
876
877 if (!screen->pm.mp_counters_enabled) {
878 screen->pm.mp_counters_enabled = true;
879 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
880 PUSH_DATA (push, 0x1fcb);
881 }
882
883 /* set sequence field to 0 (used to check if result is available) */
884 for (i = 0; i < screen->mp_count; ++i)
885 hq->data[i * 10 + 10] = 0;
886 hq->sequence++;
887
888 for (i = 0; i < cfg->num_counters; ++i) {
889 const unsigned d = cfg->ctr[i].sig_dom;
890
891 if (!screen->pm.num_hw_sm_active[d]) {
892 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
893 if (screen->pm.num_hw_sm_active[!d])
894 m |= 1 << (7 + (8 * d));
895 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
896 PUSH_DATA (push, m);
897 }
898 screen->pm.num_hw_sm_active[d]++;
899
900 for (c = d * 4; c < (d * 4 + 4); ++c) {
901 if (!screen->pm.mp_counter[c]) {
902 hsq->ctr[i] = c;
903 screen->pm.mp_counter[c] = hsq;
904 break;
905 }
906 }
907 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
908
909 /* configure and reset the counter(s) */
910 if (d == 0)
911 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
912 else
913 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
914 PUSH_DATA (push, cfg->ctr[i].sig_sel);
915 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
916 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
917 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
918 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
919 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
920 PUSH_DATA (push, 0);
921 }
922 return true;
923 }
924
/* Begin an MP (SM) performance counter query on Fermi (NVC0:NVE4).
 *
 * Assigns one of the 8 global counter slots to each counter of the query
 * config and programs the selected signals into the compute engine's MP PM
 * registers. Returns false when not enough free slots remain.
 */
static boolean
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;

   /* Kepler and newer chips use the NVE4 variant (two signal domains). */
   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return nve4_hw_sm_begin_query(nvc0, hq);

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots (single domain of 8) */
   if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return false;
   }

   assert(cfg->num_counters <= 8);
   PUSH_SPACE(push, 8 * 8 + 2);

   /* set sequence field to 0 (used to check if result is available) */
   for (i = 0; i < screen->mp_count; ++i) {
      const unsigned b = (0x30 / 4) * i; /* 0x30 bytes of query data per MP */
      hq->data[b + 8] = 0;
   }
   hq->sequence++;

   for (i = 0; i < cfg->num_counters; ++i) {
      uint32_t mask_sel = 0x00000000;

      /* first active counter: notify the kernel via the software object
       * method -- presumably turns MP counting on (mirrors the 0x0600
       * method used on the NVE4 path) */
      if (!screen->pm.num_hw_sm_active[0]) {
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, 0x80000000);
      }
      screen->pm.num_hw_sm_active[0]++;

      /* grab the first free counter slot for this counter */
      for (c = 0; c < 8; ++c) {
         if (!screen->pm.mp_counter[c]) {
            hsq->ctr[i] = c;
            screen->pm.mp_counter[c] = hsq;
            break;
         }
      }

      /* Oddly-enough, the signal id depends on the slot selected on Fermi but
       * not on Kepler. Fortunately, the signal ids are just offset by the
       * slot id! */
      mask_sel |= c;
      mask_sel |= (c << 8);
      mask_sel |= (c << 16);
      mask_sel |= (c << 24);
      mask_sel &= cfg->ctr[i].src_mask;

      /* configure and reset the counter(s) */
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
   return true;
}
993
/* End an MP performance counter query (both NVC0 and NVE4 paths).
 *
 * Stops counting, releases this query's counter slots, then launches a
 * small compute program that copies the raw counter values plus a sequence
 * number into the query's buffer object (one record per MP). Counters still
 * owned by other in-flight queries are re-armed afterwards.
 */
static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3];
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   unsigned c;

   /* lazily build the compute program that reads the counters back,
    * shared by all MP queries on this screen */
   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->num_gprs = 14;
      prog->parm_size = 12;
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
      } else {
         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   /* make the query buffer visible to the compute engine */
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   /* serialize so the counters are stable before the readout shader runs */
   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   /* launch the readout shader: input = 64-bit destination address (lo/hi)
    * and the sequence number it must write next to the counter values */
   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (hq->bo->offset + hq->base_offset);
   input[1] = (hq->bo->offset + hq->base_offset) >> 32;
   input[2] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0;
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         /* each slot is re-armed at most once */
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}
1082
1083 static inline bool
1084 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
1085 struct nvc0_context *nvc0, bool wait,
1086 struct nvc0_hw_query *hq,
1087 const struct nvc0_hw_sm_query_cfg *cfg,
1088 unsigned mp_count)
1089 {
1090 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1091 unsigned p, c;
1092
1093 for (p = 0; p < mp_count; ++p) {
1094 const unsigned b = (0x30 / 4) * p;
1095
1096 for (c = 0; c < cfg->num_counters; ++c) {
1097 if (hq->data[b + 8] != hq->sequence) {
1098 if (!wait)
1099 return false;
1100 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
1101 return false;
1102 }
1103 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
1104 }
1105 }
1106 return true;
1107 }
1108
/* Collect the raw counter values written by the readout program on Kepler.
 *
 * Per-MP record (0x60 bytes, see nvc0_hw_sm_create_query): words [0..15]
 * hold the four warp-scheduler copies of counters C0-C3, words [16..19]
 * the MP-global counters C4-C7, and words [20..23] the four per-warp-
 * scheduler sequence numbers.
 *
 * Returns false when the results are not ready yet and @wait is false, or
 * when waiting on the buffer object fails.
 */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* counters 4-7 exist once per MP; 0-3 once per warp scheduler */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               /* MP-global counter: single value at words [16..19] */
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               /* per-scheduler counter: sum the four copies */
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}
1140
1141 /* Metric calculations:
1142 * sum(x) ... sum of x over all MPs
1143 * avg(x) ... average of x over all MPs
1144 *
1145 * IPC : sum(inst_executed) / clock
1146 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
1147 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
1148 * MP_EFFICIENCY : avg(active_cycles / clock)
1149 *
1150 * NOTE: Interpretation of IPC requires knowledge of MP count.
1151 */
/* Combine the per-MP counter values into the final 64-bit query result,
 * according to the operation and normalization (norm[0]/norm[1]) of the
 * query config. Returns false when the raw data is not available yet.
 */
static boolean
nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
                            boolean wait, union pipe_query_result *result)
{
   uint32_t count[32][8];
   uint64_t value = 0;
   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
   unsigned p, c;
   const struct nvc0_hw_sm_query_cfg *cfg;
   bool ret;

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
   else
      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
   if (!ret)
      return false;

   if (cfg->op == NVC0_COUNTER_OPn_SUM) {
      /* sum of all counters over all MPs */
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            value += count[p][c];
      value = (value * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OPn_OR) {
      /* bitwise OR of all counters over all MPs */
      uint32_t v = 0;
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            v |= count[p][c];
      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OPn_AND) {
      /* bitwise AND of all counters over all MPs */
      uint32_t v = ~0;
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            v &= count[p][c];
      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
      /* relative difference: (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
      uint64_t v[2] = { 0, 0 };
      for (p = 0; p < mp_count; ++p) {
         v[0] += count[p][0];
         v[1] += count[p][1];
      }
      if (v[0])
         value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
   } else
   if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
      /* sum(ctr0) divided by ctr1 of the first MP */
      for (p = 0; p < mp_count; ++p)
         value += count[p][0];
      if (count[0][1])
         value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
      else
         value = 0;
   } else
   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
      /* average of (ctr0 / ctr1) over the MPs where ctr0 is non-zero */
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         if (count[p][1])
            value += (count[p][0] * cfg->norm[0]) / count[p][1];
      if (mp_used)
         value /= (uint64_t)mp_used * cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
      /* average of ctr0 divided by ctr1 of the first MP */
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         value += count[p][0];
      if (count[0][1] && mp_used) {
         value *= cfg->norm[0];
         value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
      } else {
         value = 0;
      }
   }

   *(uint64_t *)result = value;
   return true;
}
1232
/* vtable hooking the MP counter queries into the generic HW query code */
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};
1239
/* Create an MP performance counter query of the given type, or return NULL
 * when the type is out of range or the kernel lacks support.
 *
 * Allocates a buffer large enough to hold the raw counter values and
 * sequence numbers for every MP; the layouts below must match what the
 * readout compute program writes and what *_hw_sm_query_read_data() reads.
 */
struct nvc0_hw_query *
nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_hw_sm_query *hsq;
   struct nvc0_hw_query *hq;
   unsigned space;

   /* MP counters need kernel support for the software object methods */
   if (nvc0->screen->base.device->drm_version < 0x01000101)
      return NULL;

   if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
       (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
      return NULL;

   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
   if (!hsq)
      return NULL;

   hq = &hsq->base;
   hq->funcs = &hw_sm_query_funcs;
   hq->base.type = type;

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      /* for each MP:
       * [00] = WS0.C0
       * [04] = WS0.C1
       * [08] = WS0.C2
       * [0c] = WS0.C3
       * [10] = WS1.C0
       * [14] = WS1.C1
       * [18] = WS1.C2
       * [1c] = WS1.C3
       * [20] = WS2.C0
       * [24] = WS2.C1
       * [28] = WS2.C2
       * [2c] = WS2.C3
       * [30] = WS3.C0
       * [34] = WS3.C1
       * [38] = WS3.C2
       * [3c] = WS3.C3
       * [40] = MP.C4
       * [44] = MP.C5
       * [48] = MP.C6
       * [4c] = MP.C7
       * [50] = WS0.sequence
       * [54] = WS1.sequence
       * [58] = WS2.sequence
       * [5c] = WS3.sequence
       */
      space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
   } else {
      /*
       * Note that padding is used to align memory access to 128 bits.
       *
       * for each MP:
       * [00] = MP.C0
       * [04] = MP.C1
       * [08] = MP.C2
       * [0c] = MP.C3
       * [10] = MP.C4
       * [14] = MP.C5
       * [18] = MP.C6
       * [1c] = MP.C7
       * [20] = MP.sequence
       * [24] = padding
       * [28] = padding
       * [2c] = padding
       */
      space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
   }

   if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
      FREE(hq);
      return NULL;
   }

   return hq;
}
1314
1315 static int
1316 nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries,
1317 unsigned id)
1318 {
1319 unsigned i, next = 0;
1320
1321 for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
1322 if (!queries[i]) {
1323 next++;
1324 } else
1325 if (i >= id && queries[id + next]) {
1326 break;
1327 }
1328 }
1329 return id + next;
1330 }
1331
1332 int
1333 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
1334 struct pipe_driver_query_info *info)
1335 {
1336 int count = 0;
1337
1338 if (screen->base.device->drm_version >= 0x01000101) {
1339 if (screen->compute) {
1340 if (screen->base.class_3d == NVE4_3D_CLASS) {
1341 count += NVE4_HW_SM_QUERY_COUNT;
1342 } else
1343 if (screen->base.class_3d < NVE4_3D_CLASS) {
1344 const struct nvc0_hw_sm_query_cfg **queries =
1345 nvc0_hw_sm_get_queries(screen);
1346 unsigned i;
1347
1348 for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
1349 if (queries[i])
1350 count++;
1351 }
1352 }
1353 }
1354 }
1355
1356 if (!info)
1357 return count;
1358
1359 if (id < count) {
1360 if (screen->compute) {
1361 if (screen->base.class_3d == NVE4_3D_CLASS) {
1362 info->name = nve4_hw_sm_query_names[id];
1363 info->query_type = NVE4_HW_SM_QUERY(id);
1364 info->max_value.u64 =
1365 (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
1366 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1367 return 1;
1368 } else
1369 if (screen->base.class_3d < NVE4_3D_CLASS) {
1370 const struct nvc0_hw_sm_query_cfg **queries =
1371 nvc0_hw_sm_get_queries(screen);
1372
1373 id = nvc0_hw_sm_get_next_query_id(queries, id);
1374 info->name = nvc0_hw_sm_query_names[id];
1375 info->query_type = NVC0_HW_SM_QUERY(id);
1376 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1377 return 1;
1378 }
1379 }
1380 }
1381 return 0;
1382 }