nvc0: read MP counters of all GPCs on Fermi
[mesa.git] src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
/*
 * Copyright 2011 Christoph Bumiller
 * Copyright 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING

#include "nvc0/nvc0_context.h"
#include "nvc0/nvc0_query_hw_sm.h"

#include "nv_object.xml.h"
#include "nvc0/nve4_compute.xml.h"
#include "nvc0/nvc0_compute.xml.h"

/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */

/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};

/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
#define NVC0_COUNTER_OPn_SUM 0
#define NVC0_COUNTER_OPn_OR 1
#define NVC0_COUNTER_OPn_AND 2
#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0] */
#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0] */

struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8];
   uint8_t num_counters;
   uint8_t op;
   uint8_t norm[2]; /* normalization num,denom */
};

#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, 0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, 0, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }

/* NOTES:
 * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *  this is inaccurate!
 */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
   _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};

#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB

/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};

#define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
#define _Q(n, c, ...) [NVC0_HW_SM_QUERY_##n] = { \
   { __VA_ARGS__ }, c, NVC0_COUNTER_OPn_SUM, { 1, 1 }, \
}

static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES, 1, _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000)),
   _Q(ACTIVE_WARPS, 6, _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
                       _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
                       _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
                       _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
                       _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
                       _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060)),
   _Q(ATOM_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030)),
   _Q(BRANCH, 2, _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
                 _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010)),
   _Q(DIVERGENT_BRANCH, 2, _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
                           _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030)),
   _Q(GLD_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030)),
   _Q(GRED_COUNT, 1, _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040)),
   _Q(GST_REQUEST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060)),
   _Q(INST_EXECUTED, 3, _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
                        _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
                        _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020)),
   _Q(INST_ISSUED1_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010)),
   _Q(INST_ISSUED1_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040)),
   _Q(INST_ISSUED2_0, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020)),
   _Q(INST_ISSUED2_1, 1, _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050)),
   _Q(LOCAL_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020)),
   _Q(LOCAL_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050)),
   _Q(PROF_TRIGGER_0, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000)),
   _Q(PROF_TRIGGER_1, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010)),
   _Q(PROF_TRIGGER_2, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020)),
   _Q(PROF_TRIGGER_3, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030)),
   _Q(PROF_TRIGGER_4, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040)),
   _Q(PROF_TRIGGER_5, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050)),
   _Q(PROF_TRIGGER_6, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060)),
   _Q(PROF_TRIGGER_7, 1, _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070)),
   _Q(SHARED_LD, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010)),
   _Q(SHARED_ST, 1, _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040)),
   _Q(THREADS_LAUNCHED, 6, _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
                           _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
                           _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
                           _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
                           _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
                           _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060)),
   _Q(TH_INST_EXECUTED_0, 6, _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
                             _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
                             _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
                             _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
                             _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
                             _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050)),
   _Q(TH_INST_EXECUTED_1, 6, _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
                             _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
                             _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
                             _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
                             _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
                             _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050)),
   _Q(TH_INST_EXECUTED_2, 6, _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
                             _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
                             _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
                             _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
                             _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
                             _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050)),
   _Q(TH_INST_EXECUTED_3, 6, _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
                             _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
                             _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
                             _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
                             _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
                             _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050)),
   _Q(WARPS_LAUNCHED, 1, _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000)),
};

#undef _Q
#undef _C

static const struct nvc0_hw_sm_query_cfg *
nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_query *q = &hq->base;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
   return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
}

static void
nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_query *q = &hq->base;
   q->funcs->destroy_query(nvc0, q);
}

static boolean
nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;
   unsigned num_ab[2] = { 0, 0 };

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   for (i = 0; i < cfg->num_counters; ++i)
      num_ab[cfg->ctr[i].sig_dom]++;

   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
      NOUVEAU_ERR("Not enough free MP counter slots!\n");
      return false;
   }

   assert(cfg->num_counters <= 4);
   PUSH_SPACE(push, 4 * 8 + 6);

   if (!screen->pm.mp_counters_enabled) {
      screen->pm.mp_counters_enabled = true;
      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
      PUSH_DATA (push, 0x1fcb);
   }

   /* set sequence fields to 0 (used to check if results are available) */
   for (i = 0; i < screen->mp_count; ++i) {
      const unsigned b = (0x60 / 4) * i;
      hq->data[b + 20] = 0;
      hq->data[b + 21] = 0;
      hq->data[b + 22] = 0;
      hq->data[b + 23] = 0;
   }
   hq->sequence++;

   for (i = 0; i < cfg->num_counters; ++i) {
      const unsigned d = cfg->ctr[i].sig_dom;

      if (!screen->pm.num_hw_sm_active[d]) {
         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
         if (screen->pm.num_hw_sm_active[!d])
            m |= 1 << (7 + (8 * d));
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, m);
      }
      screen->pm.num_hw_sm_active[d]++;

      for (c = d * 4; c < (d * 4 + 4); ++c) {
         if (!screen->pm.mp_counter[c]) {
            hsq->ctr[i] = c;
            screen->pm.mp_counter[c] = hsq;
            break;
         }
      }
      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */

      /* configure and reset the counter(s) */
      if (d == 0)
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
      else
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
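      /* 0x2108421 has one bit set in every 5-bit group (bits 0, 5, 10, ...),
       * so adding 0x2108421 * (c & 3) presumably offsets each 5-bit
       * source-select field by the slot index within the domain. */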
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
   return true;
}

static boolean
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return nve4_hw_sm_begin_query(nvc0, hq);

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
      NOUVEAU_ERR("Not enough free MP counter slots!\n");
      return false;
   }

   assert(cfg->num_counters <= 8);
   PUSH_SPACE(push, 8 * 8 + 2);

   /* set sequence field to 0 (used to check if result is available) */
   for (i = 0; i < screen->mp_count; ++i) {
      const unsigned b = (0x30 / 4) * i;
      hq->data[b + 8] = 0;
   }
   hq->sequence++;

   for (i = 0; i < cfg->num_counters; ++i) {
      uint32_t mask_sel = 0x00000000;

      if (!screen->pm.num_hw_sm_active[0]) {
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, 0x80000000);
      }
      screen->pm.num_hw_sm_active[0]++;

      for (c = 0; c < 8; ++c) {
         if (!screen->pm.mp_counter[c]) {
            hsq->ctr[i] = c;
            screen->pm.mp_counter[c] = hsq;
            break;
         }
      }

      /* Oddly enough, the signal id depends on the slot selected on Fermi but
       * not on Kepler. Fortunately, the signal ids are just offset by the
       * slot id! */
      mask_sel |= c;
      mask_sel |= (c << 8);
      mask_sel |= (c << 16);
      mask_sel |= (c << 24);
      mask_sel &= cfg->ctr[i].src_mask;

      /* configure and reset the counter(s) */
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
   return true;
}

static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3];
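   /* one block per MP for each GPC; the read-out shader derives the MP it
    * landed on from $physid and only writes that MP's slice of the buffer,
    * so this should (presumably) be enough blocks to cover every MP */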
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
   unsigned c;

   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->num_gprs = 14;
      prog->parm_size = 12;
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
      } else {
         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
         screen->pm.num_hw_sm_active[d]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   pipe->bind_compute_state(pipe, screen->pm.prog);
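   /* the read-out shaders fetch the destination address from c0[0x0]/c0[0x4]
    * and the expected sequence number from c0[0x8] */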
   input[0] = (hq->bo->offset + hq->base_offset);
   input[1] = (hq->bo->offset + hq->base_offset) >> 32;
   input[2] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0;
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}

static inline bool
nvc0_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x30 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         if (hq->data[b + 8] != hq->sequence) {
            if (!wait)
               return false;
            if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
               return false;
         }
         count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
      }
   }
   return true;
}

static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
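         /* counter slots 0-3 exist per warp scheduler (four values to sum
          * up), slots 4-7 only once per MP (see the layout described in
          * nvc0_hw_sm_create_query) */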
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}

/* Metric calculations:
 * sum(x) ... sum of x over all MPs
 * avg(x) ... average of x over all MPs
 *
 * IPC              : sum(inst_executed) / clock
 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
 * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
 * MP_EFFICIENCY    : avg(active_cycles / clock)
 *
 * NOTE: Interpretation of IPC requires knowledge of MP count.
 */
static boolean
nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
                            boolean wait, union pipe_query_result *result)
{
   uint32_t count[32][8];
   uint64_t value = 0;
   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
   unsigned p, c;
   const struct nvc0_hw_sm_query_cfg *cfg;
   bool ret;

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
   else
      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
   if (!ret)
      return false;

   if (cfg->op == NVC0_COUNTER_OPn_SUM) {
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            value += count[p][c];
      value = (value * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OPn_OR) {
      uint32_t v = 0;
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            v |= count[p][c];
      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OPn_AND) {
      uint32_t v = ~0;
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            v &= count[p][c];
      value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
      uint64_t v[2] = { 0, 0 };
      for (p = 0; p < mp_count; ++p) {
         v[0] += count[p][0];
         v[1] += count[p][1];
      }
      if (v[0])
         value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
   } else
   if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
      for (p = 0; p < mp_count; ++p)
         value += count[p][0];
      if (count[0][1])
         value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
      else
         value = 0;
   } else
   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         if (count[p][1])
            value += (count[p][0] * cfg->norm[0]) / count[p][1];
      if (mp_used)
         value /= (uint64_t)mp_used * cfg->norm[1];
   } else
   if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         value += count[p][0];
      if (count[0][1] && mp_used) {
         value *= cfg->norm[0];
         value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
      } else {
         value = 0;
      }
   }

   *(uint64_t *)result = value;
   return true;
}

static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};

struct nvc0_hw_query *
nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_hw_sm_query *hsq;
   struct nvc0_hw_query *hq;
   unsigned space;

   if (nvc0->screen->base.device->drm_version < 0x01000101)
      return NULL;

   if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
       (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
      return NULL;

   hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
   if (!hsq)
      return NULL;

   hq = &hsq->base;
   hq->funcs = &hw_sm_query_funcs;
   hq->base.type = type;

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      /* for each MP:
       * [00] = WS0.C0
       * [04] = WS0.C1
       * [08] = WS0.C2
       * [0c] = WS0.C3
       * [10] = WS1.C0
       * [14] = WS1.C1
       * [18] = WS1.C2
       * [1c] = WS1.C3
       * [20] = WS2.C0
       * [24] = WS2.C1
       * [28] = WS2.C2
       * [2c] = WS2.C3
       * [30] = WS3.C0
       * [34] = WS3.C1
       * [38] = WS3.C2
       * [3c] = WS3.C3
       * [40] = MP.C4
       * [44] = MP.C5
       * [48] = MP.C6
       * [4c] = MP.C7
       * [50] = WS0.sequence
       * [54] = WS1.sequence
       * [58] = WS2.sequence
       * [5c] = WS3.sequence
       */
      space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
   } else {
      /*
       * Note that padding is used to align memory access to 128 bits.
       *
       * for each MP:
       * [00] = MP.C0
       * [04] = MP.C1
       * [08] = MP.C2
       * [0c] = MP.C3
       * [10] = MP.C4
       * [14] = MP.C5
       * [18] = MP.C6
       * [1c] = MP.C7
       * [20] = MP.sequence
       * [24] = padding
       * [28] = padding
       * [2c] = padding
       */
      space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
   }

   if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
      FREE(hq);
      return NULL;
   }

   return hq;
}