nvc0: rip off the kepler MP-enabling logic from the Fermi codepath
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
34
35 /* Code to read out MP counters: They are accessible via mmio, too, but let's
36 * just avoid mapping registers in userspace. We'd have to know which MPs are
37 * enabled/present, too, and that information is not presently exposed.
38 * We could add a kernel interface for it, but reading the counters like this
39 * has the advantage of being async (if get_result isn't called immediately).
40 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* Pre-assembled Kepler (SM30) ISA for the listing below: one thread per
    * warp scheduler reads the 8 $pm registers and stores them, plus the
    * sequence number from c0[0x8], to the destination given in c0[0x0:0x4].
    *
    * sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
120
121 /* For simplicity, we will allocate as many group slots as we allocate counter
122 * slots. This means that a single counter which wants to source from 2 groups
123 * will have to be declared as using 2 counter slots. This shouldn't really be
124 * a problem because such queries don't make much sense ... (unless someone is
125 * really creative).
126 */
/* Configuration of a single hardware counter slot: which signal group and
 * sources it samples, and how those sources are combined. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t num_src : 3;  /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint64_t src_sel;      /* signal selection for up to 6 sources (48 bit) */
};
136
/* How the per-MP counter values of a query are reduced into the final
 * result: OPn_* accept any number of counters, OP2_* exactly two. */
#define NVC0_COUNTER_OPn_SUM 0
#define NVC0_COUNTER_OPn_OR 1
#define NVC0_COUNTER_OPn_AND 2
#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */
144
/* Full description of one query type: up to 4 counter slots, the reduction
 * operator applied to them, and a num/denom normalization factor. */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[4];
   uint8_t num_counters; /* number of valid entries in ctr[] */
   uint8_t op;           /* NVC0_COUNTER_OP* reduction */
   uint8_t norm[2];      /* normalization num,denom */
};
152
/* Table-entry helpers for the NVE4 query list below:
 *   n = query name suffix, f = func mask, m = FUNC_MODE suffix,
 *   g = signal group (SIGSEL suffix), s = packed source selects,
 *   nu/dn = normalization numerator/denominator.
 * _Q1A/_Q1B: single counter in domain A (per warp scheduler) / B (per MP).
 * _M2A/_M2B/_M2AB: two-counter metrics combined with operator o, with both
 * counters in domain A, both in B, or one in each, respectively. */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
167
168 /* NOTES:
169 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
170 * inst_executed etc.: we only count a single warp scheduler
171 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
172 * this is inaccurate !
173 */
/* Query configuration table for Kepler (NVE4+); see the NOTES above. */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
   _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};
226
/* Keep the table-entry helpers local to the table above. */
#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB
231
232 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* Pre-assembled Fermi (SM20) ISA for the listing below: thread 0 of each
    * MP reads the 8 $pm registers and stores them, plus the sequence number
    * from c0[0x8], to the destination given in c0[0x0:0x4].
    *
    * mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 36
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x1000000090821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
282
/* Table-entry helper for the Fermi (NVC0:NVE4) query list:
 *   n = query name suffix, f = func mask, m = OP_MODE suffix,
 *   g = raw signal group, c = number of sources, s0..s5 = per-source
 *   signal selects packed into the 48-bit src_sel field. */
#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }

/* Query configuration table for Fermi (NVC0:NVE4). */
static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
   _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
   _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
};

#undef _Q
321
322 static const struct nvc0_hw_sm_query_cfg *
323 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
324 {
325 struct nvc0_screen *screen = nvc0->screen;
326 struct nvc0_query *q = &hq->base;
327
328 if (screen->base.class_3d >= NVE4_3D_CLASS)
329 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
330 return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
331 }
332
333 static void
334 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
335 {
336 struct nvc0_query *q = &hq->base;
337 q->funcs->destroy_query(nvc0, q);
338 }
339
340 static boolean
341 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
342 {
343 struct nvc0_screen *screen = nvc0->screen;
344 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
345 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
346 const struct nvc0_hw_sm_query_cfg *cfg;
347 unsigned i, c;
348 unsigned num_ab[2] = { 0, 0 };
349
350 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
351
352 /* check if we have enough free counter slots */
353 for (i = 0; i < cfg->num_counters; ++i)
354 num_ab[cfg->ctr[i].sig_dom]++;
355
356 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
357 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
358 NOUVEAU_ERR("Not enough free MP counter slots !\n");
359 return false;
360 }
361
362 assert(cfg->num_counters <= 4);
363 PUSH_SPACE(push, 4 * 8 * + 6);
364
365 if (!screen->pm.mp_counters_enabled) {
366 screen->pm.mp_counters_enabled = true;
367 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
368 PUSH_DATA (push, 0x1fcb);
369 }
370
371 /* set sequence field to 0 (used to check if result is available) */
372 for (i = 0; i < screen->mp_count; ++i)
373 hq->data[i * 10 + 10] = 0;
374 hq->sequence++;
375
376 for (i = 0; i < cfg->num_counters; ++i) {
377 const unsigned d = cfg->ctr[i].sig_dom;
378
379 if (!screen->pm.num_hw_sm_active[d]) {
380 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
381 if (screen->pm.num_hw_sm_active[!d])
382 m |= 1 << (7 + (8 * d));
383 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
384 PUSH_DATA (push, m);
385 }
386 screen->pm.num_hw_sm_active[d]++;
387
388 for (c = d * 4; c < (d * 4 + 4); ++c) {
389 if (!screen->pm.mp_counter[c]) {
390 hsq->ctr[i] = c;
391 screen->pm.mp_counter[c] = hsq;
392 break;
393 }
394 }
395 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
396
397 /* configure and reset the counter(s) */
398 if (d == 0)
399 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
400 else
401 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
402 PUSH_DATA (push, cfg->ctr[i].sig_sel);
403 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
404 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
405 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
406 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
407 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
408 PUSH_DATA (push, 0);
409 }
410 return true;
411 }
412
/* Begin a MP performance counter query; dispatches to the Kepler path on
 * NVE4+, otherwise programs the Fermi (NVC0:NVE4) counters directly.
 * Returns false when not enough counter slots are free. */
static boolean
nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;
   unsigned num_ab[2] = { 0, 0 };

   /* Kepler has its own slot/enable programming */
   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return nve4_hw_sm_begin_query(nvc0, hq);

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);

   /* check if we have enough free counter slots */
   for (i = 0; i < cfg->num_counters; ++i)
      num_ab[cfg->ctr[i].sig_dom]++;

   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return false;
   }

   assert(cfg->num_counters <= 4);
   /* 8 dwords per source, up to 6 sources per counter, up to 4 counters,
    * plus 2 dwords per signal domain select */
   PUSH_SPACE(push, 4 * 8 * 6 + 4);

   /* set sequence field to 0 (used to check if result is available) */
   /* NOTE(review): the Fermi readout layout is 0x24 bytes per MP with the
    * sequence word at +0x20; verify that this i * 10 + 10 index clears the
    * intended words. */
   for (i = 0; i < screen->mp_count; ++i)
      hq->data[i * 10 + 10] = 0;
   hq->sequence++;

   for (i = 0; i < cfg->num_counters; ++i) {
      const unsigned d = cfg->ctr[i].sig_dom;
      unsigned s;

      /* first user of this domain: ask the kernel (via SW methods) to
       * count for the domain(s) now in use */
      if (!screen->pm.num_hw_sm_active[d]) {
         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
         if (screen->pm.num_hw_sm_active[!d])
            m |= 1 << (7 + (8 * d));
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, m);
      }
      screen->pm.num_hw_sm_active[d]++;

      /* grab a free counter slot within this domain's 4 slots */
      for (c = d * 4; c < (d * 4 + 4); ++c) {
         if (!screen->pm.mp_counter[c]) {
            hsq->ctr[i] = c;
            screen->pm.mp_counter[c] = hsq;
            break;
         }
      }
      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */

      /* configure and reset the counter(s) */
      for (s = 0; s < cfg->ctr[i].num_src; s++) {
         BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
         PUSH_DATA (push, cfg->ctr[i].sig_sel);
         BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
         PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
         BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
         BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
         PUSH_DATA (push, 0);
      }
   }
   return true;
}
482
/* End a MP performance counter query: launch a small compute program that
 * copies the counters of every MP into the query's buffer object, release
 * this query's counter slots, and re-enable the counters that still belong
 * to other active queries (they were disabled around the readout). */
static void
nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   uint32_t mask;
   uint32_t input[3];
   /* Kepler reads with one thread per warp scheduler (4), Fermi with one */
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, 1, 1 };
   unsigned c;

   /* lazily build the per-screen counter-readout compute program from the
    * pre-assembled opcodes above */
   /* NOTE(review): CALLOC_STRUCT may return NULL; prog would be
    * dereferenced below without a check. */
   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = true;
      prog->num_gprs = 14;
      prog->parm_size = 12;
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
      } else {
         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (screen->pm.mp_counter[c] == hsq) {
         screen->pm.num_hw_sm_active[c / 4]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                hq->bo);

   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   /* input[]: destination address (lo, hi) and the sequence number the
    * shader writes so get_result can tell when data is ready */
   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (hq->bo->offset + hq->base_offset);
   input[1] = (hq->bo->offset + hq->base_offset) >> 32;
   input[2] = hq->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0;
   for (c = 0; c < 8; ++c) {
      const struct nvc0_hw_sm_query_cfg *cfg;
      unsigned i;

      hsq = screen->pm.mp_counter[c];
      if (!hsq)
         continue;

      cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
      for (i = 0; i < cfg->num_counters; ++i) {
         /* each slot only needs re-enabling once */
         if (mask & (1 << hsq->ctr[i]))
            break;
         mask |= 1 << hsq->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}
570
571 static inline bool
572 nvc0_hw_sm_query_read_data(uint32_t count[32][4],
573 struct nvc0_context *nvc0, bool wait,
574 struct nvc0_hw_query *hq,
575 const struct nvc0_hw_sm_query_cfg *cfg,
576 unsigned mp_count)
577 {
578 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
579 unsigned p, c;
580
581 for (p = 0; p < mp_count; ++p) {
582 const unsigned b = (0x24 / 4) * p;
583
584 for (c = 0; c < cfg->num_counters; ++c) {
585 if (hq->data[b + 8] != hq->sequence) {
586 if (!wait)
587 return false;
588 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
589 return false;
590 }
591 count[p][c] = hq->data[b + hsq->ctr[c]];
592 }
593 }
594 return true;
595 }
596
/* Copy the raw Kepler counter values of all MPs into count[mp][counter].
 *
 * Counters 0-3 exist once per warp scheduler (4 copies, summed here, with
 * one sequence word checked per scheduler), while counters 4-7
 * (hsq->ctr & ~3 non-zero) exist once per MP. A result is ready when the
 * relevant sequence word matches the query's sequence; otherwise fail
 * immediately (!wait) or block on the buffer object. */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][4],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_hw_query *hq,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      /* each MP writes 0x60 bytes (see the layout in create_query) */
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* per-MP counters need a single read, per-scheduler ones four */
         for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hsq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
            else
               count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
         }
      }
   }
   return true;
}
628
629 /* Metric calculations:
630 * sum(x) ... sum of x over all MPs
631 * avg(x) ... average of x over all MPs
632 *
633 * IPC : sum(inst_executed) / clock
634 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
635 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
636 * MP_EFFICIENCY : avg(active_cycles / clock)
637 *
638 * NOTE: Interpretation of IPC requires knowledge of MP count.
639 */
640 static boolean
641 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
642 boolean wait, union pipe_query_result *result)
643 {
644 uint32_t count[32][4];
645 uint64_t value = 0;
646 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
647 unsigned p, c;
648 const struct nvc0_hw_sm_query_cfg *cfg;
649 bool ret;
650
651 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
652
653 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
654 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
655 else
656 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
657 if (!ret)
658 return false;
659
660 if (cfg->op == NVC0_COUNTER_OPn_SUM) {
661 for (c = 0; c < cfg->num_counters; ++c)
662 for (p = 0; p < mp_count; ++p)
663 value += count[p][c];
664 value = (value * cfg->norm[0]) / cfg->norm[1];
665 } else
666 if (cfg->op == NVC0_COUNTER_OPn_OR) {
667 uint32_t v = 0;
668 for (c = 0; c < cfg->num_counters; ++c)
669 for (p = 0; p < mp_count; ++p)
670 v |= count[p][c];
671 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
672 } else
673 if (cfg->op == NVC0_COUNTER_OPn_AND) {
674 uint32_t v = ~0;
675 for (c = 0; c < cfg->num_counters; ++c)
676 for (p = 0; p < mp_count; ++p)
677 v &= count[p][c];
678 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
679 } else
680 if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
681 uint64_t v[2] = { 0, 0 };
682 for (p = 0; p < mp_count; ++p) {
683 v[0] += count[p][0];
684 v[1] += count[p][1];
685 }
686 if (v[0])
687 value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
688 } else
689 if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
690 for (p = 0; p < mp_count; ++p)
691 value += count[p][0];
692 if (count[0][1])
693 value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
694 else
695 value = 0;
696 } else
697 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
698 unsigned mp_used = 0;
699 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
700 if (count[p][1])
701 value += (count[p][0] * cfg->norm[0]) / count[p][1];
702 if (mp_used)
703 value /= (uint64_t)mp_used * cfg->norm[1];
704 } else
705 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
706 unsigned mp_used = 0;
707 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
708 value += count[p][0];
709 if (count[0][1] && mp_used) {
710 value *= cfg->norm[0];
711 value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
712 } else {
713 value = 0;
714 }
715 }
716
717 *(uint64_t *)result = value;
718 return true;
719 }
720
/* Function table plugged into the generic HW query machinery for SM
 * performance counter queries. */
static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
   .destroy_query = nvc0_hw_sm_destroy_query,
   .begin_query = nvc0_hw_sm_begin_query,
   .end_query = nvc0_hw_sm_end_query,
   .get_query_result = nvc0_hw_sm_get_query_result,
};
727
728 struct nvc0_hw_query *
729 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
730 {
731 struct nvc0_screen *screen = nvc0->screen;
732 struct nvc0_hw_sm_query *hsq;
733 struct nvc0_hw_query *hq;
734 unsigned space;
735
736 if (nvc0->screen->base.device->drm_version < 0x01000101)
737 return NULL;
738
739 if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
740 (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
741 return NULL;
742
743 hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
744 if (!hsq)
745 return NULL;
746
747 hq = &hsq->base;
748 hq->funcs = &hw_sm_query_funcs;
749 hq->base.type = type;
750
751 if (screen->base.class_3d >= NVE4_3D_CLASS) {
752 /* for each MP:
753 * [00] = WS0.C0
754 * [04] = WS0.C1
755 * [08] = WS0.C2
756 * [0c] = WS0.C3
757 * [10] = WS1.C0
758 * [14] = WS1.C1
759 * [18] = WS1.C2
760 * [1c] = WS1.C3
761 * [20] = WS2.C0
762 * [24] = WS2.C1
763 * [28] = WS2.C2
764 * [2c] = WS2.C3
765 * [30] = WS3.C0
766 * [34] = WS3.C1
767 * [38] = WS3.C2
768 * [3c] = WS3.C3
769 * [40] = MP.C4
770 * [44] = MP.C5
771 * [48] = MP.C6
772 * [4c] = MP.C7
773 * [50] = WS0.sequence
774 * [54] = WS1.sequence
775 * [58] = WS2.sequence
776 * [5c] = WS3.sequence
777 */
778 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
779 } else {
780 /* for each MP:
781 * [00] = MP.C0
782 * [04] = MP.C1
783 * [08] = MP.C2
784 * [0c] = MP.C3
785 * [10] = MP.C4
786 * [14] = MP.C5
787 * [18] = MP.C6
788 * [1c] = MP.C7
789 * [20] = MP.sequence
790 */
791 space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
792 }
793
794 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
795 FREE(hq);
796 return NULL;
797 }
798
799 return hq;
800 }