nouveau: relax fence emit space assert
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
34
35 /* NOTE: intentionally using the same names as NV */
/* NOTE: intentionally using the same names as NV */
/* Human-readable names for the NVE4+ SM queries; entry order presumably
 * mirrors the NVE4_HW_SM_QUERY_* enum used to index nve4_hw_sm_queries —
 * TODO confirm against nvc0_query_hw_sm.h. */
static const char *nve4_hw_sm_query_names[] =
{
   /* MP counters */
   "active_cycles",
   "active_warps",
   "atom_count",
   "branch",
   "divergent_branch",
   "gld_request",
   "global_ld_mem_divergence_replays",
   "global_store_transaction",
   "global_st_mem_divergence_replays",
   "gred_count",
   "gst_request",
   "inst_executed",
   "inst_issued",
   "inst_issued1",
   "inst_issued2",
   "l1_global_load_hit",
   "l1_global_load_miss",
   "l1_local_load_hit",
   "l1_local_load_miss",
   "l1_local_store_hit",
   "l1_local_store_miss",
   "l1_shared_load_transactions",
   "l1_shared_store_transactions",
   "local_load",
   "local_load_transactions",
   "local_store",
   "local_store_transactions",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "shared_load",
   "shared_load_replay",
   "shared_store",
   "shared_store_replay",
   "sm_cta_launched",
   "threads_launched",
   "uncached_global_load_transaction",
   "warps_launched",
   /* metrics, i.e. functions of the MP counters */
   "metric-ipc",                   /* inst_executed, clock */
   "metric-ipac",                  /* inst_executed, active_cycles */
   "metric-ipec",                  /* inst_executed, (bool)inst_executed */
   "metric-achieved_occupancy",    /* active_warps, active_cycles */
   "metric-sm_efficiency",         /* active_cycles, clock */
   "metric-inst_replay_overhead"   /* inst_issued, inst_executed */
};
90
91 /* Code to read out MP counters: They are accessible via mmio, too, but let's
92 * just avoid mapping registers in userspace. We'd have to know which MPs are
93 * enabled/present, too, and that information is not presently exposed.
94 * We could add a kernel interface for it, but reading the counters like this
95 * has the advantage of being async (if get_result isn't called immediately).
96 */
/* Precompiled GPU machine code that copies the $pm0..$pm7 counters of each
 * MP into the query buffer; the disassembly is in the comment below.  Do not
 * edit the opcodes by hand — regenerate from the assembly instead. */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
176
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
/* Configuration of a single hardware MP performance counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
192
/* How the per-counter results are combined into a single query result. */
#define NVC0_COUNTER_OPn_SUM            0 /* sum over all counters / MPs */
#define NVC0_COUNTER_OPn_OR             1
#define NVC0_COUNTER_OPn_AND            2
#define NVC0_COUNTER_OP2_REL_SUM_MM     3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0     4 /* sum(ctr0) / ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / ctr1 of MP[0]) */
200
/* Full description of one SM query: up to 8 counter configs plus the
 * operation and normalization applied when computing the result. */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8];
   uint8_t num_counters;  /* number of valid entries in ctr[] */
   uint8_t op;            /* one of NVC0_COUNTER_OP* */
   uint8_t norm[2]; /* normalization num,denom */
};
208
/* Table-entry helpers: _Q1A/_Q1B declare a single-counter query sourced from
 * signal domain A resp. B; _M2A/_M2B/_M2AB declare two-counter metrics with
 * both counters in A, both in B, or one in each domain. */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
223
/* NOTES:
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *  this is inaccurate !
 */
/* Per-query counter configuration for NVE4+, indexed by NVE4_HW_SM_QUERY_*. */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
   _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};
282
/* Scope the table-entry helpers to the table above.  _M2AB was previously
 * left defined (leaking into the rest of the file) while its siblings were
 * undefined; undef it too for consistency. */
#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB
287
/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* NOTES:
 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
 *   because there is a context-switch problem that we need to fix.
 *   Results might be wrong sometimes, be careful!
 */
/* Human-readable names for the Fermi (NVC0:NVE4) SM queries; entry order
 * presumably mirrors the NVC0_HW_SM_QUERY_* enum — TODO confirm. */
static const char *nvc0_hw_sm_query_names[] =
{
   /* MP counters */
   "active_cycles",
   "active_warps",
   "atom_count",
   "branch",
   "divergent_branch",
   "gld_request",
   "gred_count",
   "gst_request",
   "inst_executed",
   "inst_issued",
   "inst_issued1_0",
   "inst_issued1_1",
   "inst_issued2_0",
   "inst_issued2_1",
   "local_load",
   "local_store",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "shared_load",
   "shared_store",
   "threads_launched",
   "thread_inst_executed_0",
   "thread_inst_executed_1",
   "thread_inst_executed_2",
   "thread_inst_executed_3",
   "warps_launched",
};
330
/* Precompiled GPU machine code for the Fermi counter readout (analogous to
 * the NVE4 variant above); the disassembly is in the comment below.  Do not
 * edit the opcodes by hand. */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
380
/* _C builds one counter config (domain is always 0 on Fermi);
 * _Q places a config pointer at its NVC0_HW_SM_QUERY_* table slot. */
#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c
383
/* ==== Compute capability 2.0 (GF100/GF110) ==== */

/* "active_cycles" */
static const struct nvc0_hw_sm_query_cfg
sm20_active_cycles =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "active_warps": summed over six counter slots */
static const struct nvc0_hw_sm_query_cfg
sm20_active_warps =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "atom_count" */
static const struct nvc0_hw_sm_query_cfg
sm20_atom_count =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "branch" */
static const struct nvc0_hw_sm_query_cfg
sm20_branch =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
   .num_counters = 2,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "divergent_branch" */
static const struct nvc0_hw_sm_query_cfg
sm20_divergent_branch =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
   .num_counters = 2,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "gld_request" */
static const struct nvc0_hw_sm_query_cfg
sm20_gld_request =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "gred_count" */
static const struct nvc0_hw_sm_query_cfg
sm20_gred_count =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "gst_request" */
static const struct nvc0_hw_sm_query_cfg
sm20_gst_request =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};
463
/* "inst_executed" */
static const struct nvc0_hw_sm_query_cfg
sm20_inst_executed =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
   .num_counters = 2,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "inst_issued" */
static const struct nvc0_hw_sm_query_cfg
sm20_inst_issued =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
   .num_counters = 2,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "local_load" */
static const struct nvc0_hw_sm_query_cfg
sm20_local_ld =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "local_store" */
static const struct nvc0_hw_sm_query_cfg
sm20_local_st =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};
501
/* "prof_trigger_00".."prof_trigger_07": user-defined trigger signals; all
 * use signal group 0x01 with the slot encoded in src_sel. */
static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_0 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_1 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_2 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_3 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_4 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_5 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_6 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm20_prof_trigger_7 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};
573
/* "shared_load" */
static const struct nvc0_hw_sm_query_cfg
sm20_shared_ld =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "shared_store" */
static const struct nvc0_hw_sm_query_cfg
sm20_shared_st =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "threads_launched": summed over six counter slots */
static const struct nvc0_hw_sm_query_cfg
sm20_threads_launched =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "thread_inst_executed_0" */
static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_0 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "thread_inst_executed_1" */
static const struct nvc0_hw_sm_query_cfg
sm20_th_inst_executed_1 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "warps_launched" */
static const struct nvc0_hw_sm_query_cfg
sm20_warps_launched =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};
642
/* SM 2.0 query table, indexed by NVC0_HW_SM_QUERY_*.  NULL entries have no
 * configuration on this generation (presumably unsupported here). */
static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES,       &sm20_active_cycles),
   _Q(ACTIVE_WARPS,        &sm20_active_warps),
   _Q(ATOM_COUNT,          &sm20_atom_count),
   _Q(BRANCH,              &sm20_branch),
   _Q(DIVERGENT_BRANCH,    &sm20_divergent_branch),
   _Q(GLD_REQUEST,         &sm20_gld_request),
   _Q(GRED_COUNT,          &sm20_gred_count),
   _Q(GST_REQUEST,         &sm20_gst_request),
   _Q(INST_EXECUTED,       &sm20_inst_executed),
   _Q(INST_ISSUED,         &sm20_inst_issued),
   _Q(INST_ISSUED1_0,      NULL),
   _Q(INST_ISSUED1_1,      NULL),
   _Q(INST_ISSUED2_0,      NULL),
   _Q(INST_ISSUED2_1,      NULL),
   _Q(LOCAL_LD,            &sm20_local_ld),
   _Q(LOCAL_ST,            &sm20_local_st),
   _Q(PROF_TRIGGER_0,      &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1,      &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2,      &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3,      &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4,      &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5,      &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6,      &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7,      &sm20_prof_trigger_7),
   _Q(SHARED_LD,           &sm20_shared_ld),
   _Q(SHARED_ST,           &sm20_shared_st),
   _Q(THREADS_LAUNCHED,    &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0,  &sm20_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1,  &sm20_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2,  NULL),
   _Q(TH_INST_EXECUTED_3,  NULL),
   _Q(WARPS_LAUNCHED,      &sm20_warps_launched),
};
678
/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */

/* "inst_executed": three counter slots on SM 2.1 (vs. two on SM 2.0) */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_executed =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
   .num_counters = 3,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "inst_issued1_0" */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_0 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "inst_issued1_1" */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued1_1 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "inst_issued2_0" */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_0 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

/* "inst_issued2_1" */
static const struct nvc0_hw_sm_query_cfg
sm21_inst_issued2_1 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
   .num_counters = 1,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};
726
/* "thread_inst_executed_0".."_3": per-quadrant thread instruction counts;
 * SM 2.1 uses signal groups 0xa3/0xa5/0xa4/0xa6 (note the 1<->2 swap). */
static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_0 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_1 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_2 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};

static const struct nvc0_hw_sm_query_cfg
sm21_th_inst_executed_3 =
{
   .ctr[0]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
   .ctr[1]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
   .ctr[2]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
   .ctr[3]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
   .ctr[4]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
   .ctr[5]       = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
   .num_counters = 6,
   .op           = NVC0_COUNTER_OPn_SUM,
   .norm         = { 1, 1 },
};
782
/* SM 2.1 query table, indexed by NVC0_HW_SM_QUERY_*; entries shared with
 * SM 2.0 reuse the sm20_* configs above. */
static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES,       &sm20_active_cycles),
   _Q(ACTIVE_WARPS,        &sm20_active_warps),
   _Q(ATOM_COUNT,          &sm20_atom_count),
   _Q(BRANCH,              &sm20_branch),
   _Q(DIVERGENT_BRANCH,    &sm20_divergent_branch),
   _Q(GLD_REQUEST,         &sm20_gld_request),
   _Q(GRED_COUNT,          &sm20_gred_count),
   _Q(GST_REQUEST,         &sm20_gst_request),
   _Q(INST_EXECUTED,       &sm21_inst_executed),
   _Q(INST_ISSUED,         NULL),
   _Q(INST_ISSUED1_0,      &sm21_inst_issued1_0),
   _Q(INST_ISSUED1_1,      &sm21_inst_issued1_1),
   _Q(INST_ISSUED2_0,      &sm21_inst_issued2_0),
   _Q(INST_ISSUED2_1,      &sm21_inst_issued2_1),
   _Q(LOCAL_LD,            &sm20_local_ld),
   _Q(LOCAL_ST,            &sm20_local_st),
   _Q(PROF_TRIGGER_0,      &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1,      &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2,      &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3,      &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4,      &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5,      &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6,      &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7,      &sm20_prof_trigger_7),
   _Q(SHARED_LD,           &sm20_shared_ld),
   _Q(SHARED_ST,           &sm20_shared_st),
   _Q(THREADS_LAUNCHED,    &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0,  &sm21_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1,  &sm21_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2,  &sm21_th_inst_executed_2),
   _Q(TH_INST_EXECUTED_3,  &sm21_th_inst_executed_3),
   _Q(WARPS_LAUNCHED,      &sm20_warps_launched),
};

#undef _Q
#undef _C
821
822 static inline const struct nvc0_hw_sm_query_cfg **
823 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
824 {
825 struct nouveau_device *dev = screen->base.device;
826
827 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
828 return sm20_hw_sm_queries;
829 return sm21_hw_sm_queries;
830 }
831
832 static const struct nvc0_hw_sm_query_cfg *
833 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
834 {
835 struct nvc0_screen *screen = nvc0->screen;
836 struct nvc0_query *q = &hq->base;
837
838 if (screen->base.class_3d >= NVE4_3D_CLASS)
839 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
840
841 if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) {
842 const struct nvc0_hw_sm_query_cfg **queries =
843 nvc0_hw_sm_get_queries(screen);
844 return queries[q->type - NVC0_HW_SM_QUERY(0)];
845 }
846 debug_printf("invalid query type: %d\n", q->type);
847 return NULL;
848 }
849
850 static void
851 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
852 {
853 struct nvc0_query *q = &hq->base;
854 q->funcs->destroy_query(nvc0, q);
855 }
856
857 static boolean
858 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
859 {
860 struct nvc0_screen *screen = nvc0->screen;
861 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
862 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
863 const struct nvc0_hw_sm_query_cfg *cfg;
864 unsigned i, c;
865 unsigned num_ab[2] = { 0, 0 };
866
867 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
868
869 /* check if we have enough free counter slots */
870 for (i = 0; i < cfg->num_counters; ++i)
871 num_ab[cfg->ctr[i].sig_dom]++;
872
873 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
874 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
875 NOUVEAU_ERR("Not enough free MP counter slots !\n");
876 return false;
877 }
878
879 assert(cfg->num_counters <= 4);
880 PUSH_SPACE(push, 4 * 8 * + 6);
881
882 if (!screen->pm.mp_counters_enabled) {
883 screen->pm.mp_counters_enabled = true;
884 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
885 PUSH_DATA (push, 0x1fcb);
886 }
887
888 /* set sequence field to 0 (used to check if result is available) */
889 for (i = 0; i < screen->mp_count; ++i)
890 hq->data[i * 10 + 10] = 0;
891 hq->sequence++;
892
893 for (i = 0; i < cfg->num_counters; ++i) {
894 const unsigned d = cfg->ctr[i].sig_dom;
895
896 if (!screen->pm.num_hw_sm_active[d]) {
897 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
898 if (screen->pm.num_hw_sm_active[!d])
899 m |= 1 << (7 + (8 * d));
900 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
901 PUSH_DATA (push, m);
902 }
903 screen->pm.num_hw_sm_active[d]++;
904
905 for (c = d * 4; c < (d * 4 + 4); ++c) {
906 if (!screen->pm.mp_counter[c]) {
907 hsq->ctr[i] = c;
908 screen->pm.mp_counter[c] = hsq;
909 break;
910 }
911 }
912 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
913
914 /* configure and reset the counter(s) */
915 if (d == 0)
916 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
917 else
918 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
919 PUSH_DATA (push, cfg->ctr[i].sig_sel);
920 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
921 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
922 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
923 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
924 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
925 PUSH_DATA (push, 0);
926 }
927 return true;
928 }
929
930 static boolean
931 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
932 {
933 struct nvc0_screen *screen = nvc0->screen;
934 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
935 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
936 const struct nvc0_hw_sm_query_cfg *cfg;
937 unsigned i, c;
938
939 if (screen->base.class_3d >= NVE4_3D_CLASS)
940 return nve4_hw_sm_begin_query(nvc0, hq);
941
942 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
943
944 /* check if we have enough free counter slots */
945 if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
946 NOUVEAU_ERR("Not enough free MP counter slots !\n");
947 return false;
948 }
949
950 assert(cfg->num_counters <= 8);
951 PUSH_SPACE(push, 8 * 8 + 2);
952
953 /* set sequence field to 0 (used to check if result is available) */
954 for (i = 0; i < screen->mp_count; ++i) {
955 const unsigned b = (0x30 / 4) * i;
956 hq->data[b + 8] = 0;
957 }
958 hq->sequence++;
959
960 for (i = 0; i < cfg->num_counters; ++i) {
961 uint32_t mask_sel = 0x00000000;
962
963 if (!screen->pm.num_hw_sm_active[0]) {
964 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
965 PUSH_DATA (push, 0x80000000);
966 }
967 screen->pm.num_hw_sm_active[0]++;
968
969 for (c = 0; c < 8; ++c) {
970 if (!screen->pm.mp_counter[c]) {
971 hsq->ctr[i] = c;
972 screen->pm.mp_counter[c] = hsq;
973 break;
974 }
975 }
976
977 /* Oddly-enough, the signal id depends on the slot selected on Fermi but
978 * not on Kepler. Fortunately, the signal ids are just offseted by the
979 * slot id! */
980 mask_sel |= c;
981 mask_sel |= (c << 8);
982 mask_sel |= (c << 16);
983 mask_sel |= (c << 24);
984 mask_sel &= cfg->ctr[i].src_mask;
985
986 /* configure and reset the counter(s) */
987 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
988 PUSH_DATA (push, cfg->ctr[i].sig_sel);
989 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
990 PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
991 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
992 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
993 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
994 PUSH_DATA (push, 0);
995 }
996 return true;
997 }
998
/* Stop a HW SM query and kick off the readback of its counters.
 *
 * Disables counting, releases the slots this query owned, then launches a
 * small compute program that copies the MP counter values into the query's
 * buffer (one workgroup per MP). Finally re-programs any counters still
 * owned by other in-flight queries, since MP_PM_FUNC/MP_PM_OP were cleared
 * for all slots above.
 */
static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3];
   /* 32 lanes per group; NVE4 uses 4 warps to cover its 4 warp schedulers */
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   unsigned c;

   /* lazily build the counter-readback compute program from the canned
    * shader binaries (shared by all SM queries on this screen) */
   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->num_gprs = 14;
      prog->parm_size = 12;
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
      } else {
         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   /* make the query BO writable by the readback grid */
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   /* serialize so counting has stopped before the readback runs */
   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   /* input[] = { destination address (lo, hi), expected sequence number } */
   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (hq->bo->offset + hq->base_offset);
   input[1] = (hq->bo->offset + hq->base_offset) >> 32;
   input[2] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0; /* slots already re-enabled in this pass */
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}
1087
1088 static inline bool
1089 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
1090 struct nvc0_context *nvc0, bool wait,
1091 struct nvc0_hw_query *hq,
1092 const struct nvc0_hw_sm_query_cfg *cfg,
1093 unsigned mp_count)
1094 {
1095 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
1096 unsigned p, c;
1097
1098 for (p = 0; p < mp_count; ++p) {
1099 const unsigned b = (0x30 / 4) * p;
1100
1101 for (c = 0; c < cfg->num_counters; ++c) {
1102 if (hq->data[b + 8] != hq->sequence) {
1103 if (!wait)
1104 return false;
1105 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
1106 return false;
1107 }
1108 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
1109 }
1110 }
1111 return true;
1112 }
1113
/* Copy the counter values of a Kepler (NVE4+) SM query into
 * count[mp][counter].
 *
 * Each MP writes a 0x60-byte record (see the layout comment in
 * nvc0_hw_sm_create_query): words 0..15 hold the four per-warp-scheduler
 * copies of domain-A counters C0..C3, words 16..19 hold domain-B counters
 * C4..C7, and words 20..23 hold the four sequence numbers. Returns false
 * when a record is not ready and @wait is false, or when waiting fails.
 */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p; /* record stride in dwords */

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* domain-B slots (ctr >= 4) have a single value; domain-A slots
          * are summed over the 4 warp schedulers */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}
1145
1146 /* Metric calculations:
1147 * sum(x) ... sum of x over all MPs
1148 * avg(x) ... average of x over all MPs
1149 *
1150 * IPC : sum(inst_executed) / clock
1151 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
1152 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
1153 * MP_EFFICIENCY : avg(active_cycles / clock)
1154 *
1155 * NOTE: Interpretation of IPC requires knowledge of MP count.
1156 */
1157 static boolean
1158 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
1159 boolean wait, union pipe_query_result *result)
1160 {
1161 uint32_t count[32][8];
1162 uint64_t value = 0;
1163 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
1164 unsigned p, c;
1165 const struct nvc0_hw_sm_query_cfg *cfg;
1166 bool ret;
1167
1168 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
1169
1170 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
1171 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1172 else
1173 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
1174 if (!ret)
1175 return false;
1176
1177 if (cfg->op == NVC0_COUNTER_OPn_SUM) {
1178 for (c = 0; c < cfg->num_counters; ++c)
1179 for (p = 0; p < mp_count; ++p)
1180 value += count[p][c];
1181 value = (value * cfg->norm[0]) / cfg->norm[1];
1182 } else
1183 if (cfg->op == NVC0_COUNTER_OPn_OR) {
1184 uint32_t v = 0;
1185 for (c = 0; c < cfg->num_counters; ++c)
1186 for (p = 0; p < mp_count; ++p)
1187 v |= count[p][c];
1188 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
1189 } else
1190 if (cfg->op == NVC0_COUNTER_OPn_AND) {
1191 uint32_t v = ~0;
1192 for (c = 0; c < cfg->num_counters; ++c)
1193 for (p = 0; p < mp_count; ++p)
1194 v &= count[p][c];
1195 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
1196 } else
1197 if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
1198 uint64_t v[2] = { 0, 0 };
1199 for (p = 0; p < mp_count; ++p) {
1200 v[0] += count[p][0];
1201 v[1] += count[p][1];
1202 }
1203 if (v[0])
1204 value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
1205 } else
1206 if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
1207 for (p = 0; p < mp_count; ++p)
1208 value += count[p][0];
1209 if (count[0][1])
1210 value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
1211 else
1212 value = 0;
1213 } else
1214 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
1215 unsigned mp_used = 0;
1216 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
1217 if (count[p][1])
1218 value += (count[p][0] * cfg->norm[0]) / count[p][1];
1219 if (mp_used)
1220 value /= (uint64_t)mp_used * cfg->norm[1];
1221 } else
1222 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
1223 unsigned mp_used = 0;
1224 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
1225 value += count[p][0];
1226 if (count[0][1] && mp_used) {
1227 value *= cfg->norm[0];
1228 value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
1229 } else {
1230 value = 0;
1231 }
1232 }
1233
1234 *(uint64_t *)result = value;
1235 return true;
1236 }
1237
/* Dispatch table hooking the SM performance counter queries into the
 * generic nvc0 HW query machinery. */
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};
1244
1245 struct nvc0_hw_query *
1246 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
1247 {
1248 struct nvc0_screen *screen = nvc0->screen;
1249 struct nvc0_hw_sm_query *hsq;
1250 struct nvc0_hw_query *hq;
1251 unsigned space;
1252
1253 if (nvc0->screen->base.device->drm_version < 0x01000101)
1254 return NULL;
1255
1256 if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
1257 (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
1258 return NULL;
1259
1260 hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
1261 if (!hsq)
1262 return NULL;
1263
1264 hq = &hsq->base;
1265 hq->funcs = &hw_sm_query_funcs;
1266 hq->base.type = type;
1267
1268 if (screen->base.class_3d >= NVE4_3D_CLASS) {
1269 /* for each MP:
1270 * [00] = WS0.C0
1271 * [04] = WS0.C1
1272 * [08] = WS0.C2
1273 * [0c] = WS0.C3
1274 * [24] = WS2.C1
1275 * [28] = WS2.C2
1276 * [2c] = WS2.C3
1277 * [30] = WS3.C0
1278 * [34] = WS3.C1
1279 * [38] = WS3.C2
1280 * [3c] = WS3.C3
1281 * [40] = MP.C4
1282 * [44] = MP.C5
1283 * [48] = MP.C6
1284 * [4c] = MP.C7
1285 * [50] = WS0.sequence
1286 * [54] = WS1.sequence
1287 * [58] = WS2.sequence
1288 * [5c] = WS3.sequence
1289 */
1290 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
1291 } else {
1292 /*
1293 * Note that padding is used to align memory access to 128 bits.
1294 *
1295 * for each MP:
1296 * [00] = MP.C0
1297 * [04] = MP.C1
1298 * [08] = MP.C2
1299 * [0c] = MP.C3
1300 * [10] = MP.C4
1301 * [14] = MP.C5
1302 * [18] = MP.C6
1303 * [1c] = MP.C7
1304 * [20] = MP.sequence
1305 * [24] = padding
1306 * [28] = padding
1307 * [2c] = padding
1308 */
1309 space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
1310 }
1311
1312 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
1313 FREE(hq);
1314 return NULL;
1315 }
1316
1317 return hq;
1318 }
1319
/* Map a dense driver-query index @id to the corresponding index in the
 * sparse @queries table, skipping NULL entries.
 *
 * NOTE(review): @next accumulates the number of NULL entries seen so far;
 * the loop presumably stops once the skipped-forward slot (id + next) is
 * populated and we have scanned at least up to @id — confirm against
 * nvc0_hw_sm_get_queries(), which appears to leave holes for counters not
 * present on a given chipset.
 */
static int
nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries,
                             unsigned id)
{
   unsigned i, next = 0;

   for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
      if (!queries[i]) {
         next++;
      } else
      if (i >= id && queries[id + next]) {
         break;
      }
   }
   return id + next;
}
1336
1337 int
1338 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
1339 struct pipe_driver_query_info *info)
1340 {
1341 int count = 0;
1342
1343 if (screen->base.device->drm_version >= 0x01000101) {
1344 if (screen->compute) {
1345 if (screen->base.class_3d == NVE4_3D_CLASS) {
1346 count += NVE4_HW_SM_QUERY_COUNT;
1347 } else
1348 if (screen->base.class_3d < NVE4_3D_CLASS) {
1349 const struct nvc0_hw_sm_query_cfg **queries =
1350 nvc0_hw_sm_get_queries(screen);
1351 unsigned i;
1352
1353 for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) {
1354 if (queries[i])
1355 count++;
1356 }
1357 }
1358 }
1359 }
1360
1361 if (!info)
1362 return count;
1363
1364 if (id < count) {
1365 if (screen->compute) {
1366 if (screen->base.class_3d == NVE4_3D_CLASS) {
1367 info->name = nve4_hw_sm_query_names[id];
1368 info->query_type = NVE4_HW_SM_QUERY(id);
1369 info->max_value.u64 =
1370 (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
1371 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1372 return 1;
1373 } else
1374 if (screen->base.class_3d < NVE4_3D_CLASS) {
1375 const struct nvc0_hw_sm_query_cfg **queries =
1376 nvc0_hw_sm_get_queries(screen);
1377
1378 id = nvc0_hw_sm_get_next_query_id(queries, id);
1379 info->name = nvc0_hw_sm_query_names[id];
1380 info->query_type = NVC0_HW_SM_QUERY(id);
1381 info->group_id = NVC0_HW_SM_QUERY_GROUP;
1382 return 1;
1383 }
1384 }
1385 }
1386 return 0;
1387 }