nvc0: allow to use 8 MP counters on Fermi
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
34
35 /* Code to read out MP counters: They are accessible via mmio, too, but let's
36 * just avoid mapping registers in userspace. We'd have to know which MPs are
37 * enabled/present, too, and that information is not presently exposed.
38 * We could add a kernel interface for it, but reading the counters like this
39 * has the advantage of being async (if get_result isn't called immediately).
40 */
/* Pre-assembled binary of the compute program that reads the MP counters on
 * NVE4+; nvc0_hw_sm_end_query() installs it as prog->code. The disassembly
 * is reproduced in the comment below, the raw 64-bit words follow it.
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
120
121 /* For simplicity, we will allocate as many group slots as we allocate counter
122 * slots. This means that a single counter which wants to source from 2 groups
123 * will have to be declared as using 2 counter slots. This shouldn't really be
124 * a problem because such queries don't make much sense ... (unless someone is
125 * really creative).
126 */
/* Programming of one hardware MP counter slot. The begin_query functions
 * push sig_sel, src_sel and (func << 4) | mode to the MP_PM_* methods.
 */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8; /* signal group */
   uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */
};
136
/* How nvc0_hw_sm_get_query_result() combines the per-MP counter values.
 * OPn_* operate on any number of counters, OP2_* on exactly ctr0 and ctr1.
 */
#define NVC0_COUNTER_OPn_SUM 0 /* sum of all counters over all MPs */
#define NVC0_COUNTER_OPn_OR 1 /* bitwise OR of all counters over all MPs */
#define NVC0_COUNTER_OPn_AND 2 /* bitwise AND of all counters over all MPs */
#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / (ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / (ctr1 of MP[0]) */
144
/* Static description of one driver-specific MP query: which counters it
 * needs and how their values are reduced to a single result.
 */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8]; /* only the first num_counters are used */
   uint8_t num_counters; /* number of valid entries in ctr[] */
   uint8_t op; /* one of NVC0_COUNTER_OP* */
   uint8_t norm[2]; /* normalization num,denom */
};
152
/* Table-entry helpers for nve4_hw_sm_queries[]. Each expands to a designated
 * initializer; the counter fields are given in declaration order
 * (func, mode, num_src = 0, sig_dom, sig_sel, src_sel).
 *
 * _Q1A / _Q1B: one counter in signal domain A (sig_dom 0) / B (sig_dom 1),
 *              reduced with NVC0_COUNTER_OPn_SUM.
 */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
/* _M2A: two-counter metric, both counters in domain A, reduced with
 *       NVC0_COUNTER_OP2_<o>.
 */
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
/* _M2B: two-counter metric, both counters in domain B. */
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
/* _M2AB: two-counter metric, first counter in domain A, second in domain B. */
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
167
168 /* NOTES:
169 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
170 * inst_executed etc.: we only count a single warp scheduler
171 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
172 * this is inaccurate !
173 */
/* Query configurations for NVE4+, indexed by NVE4_HW_SM_QUERY_* and
 * NVE4_HW_SM_QUERY_METRIC_* (the macros expand to designated initializers).
 * Columns of the single-counter entries: name, func, mode, signal group,
 * src_sel, normalization numerator, normalization denominator.
 */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
   _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   /* two-counter metrics: ..., reduction op, normalization num, denom */
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};
226
/* The table-entry helper macros are only meant for nve4_hw_sm_queries[];
 * drop all of them (including _M2AB) so they do not leak into the rest of
 * the file.
 */
#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB
231
232 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* Pre-assembled binary of the compute program that reads the MP counters on
 * NVC0:NVE4; nvc0_hw_sm_end_query() installs it as prog->code. The
 * disassembly is reproduced in the comment below, the raw 64-bit words
 * follow it.
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 36
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x1000000090821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
282
/* Table-entry helper for nvc0_hw_sm_queries[]: one counter, reduced with
 * NVC0_COUNTER_OPn_SUM and normalized by 1/1.
 *   n      query name suffix (NVC0_HW_SM_QUERY_<n>)
 *   f, m   func and mode
 *   g      signal group (sig_sel)
 *   c      number of sources (num_src)
 *   s0..s5 per-source signal selection, packed 8 bits each into src_sel
 */
#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }

/* Query configurations for NVC0:NVE4, indexed by NVC0_HW_SM_QUERY_*. */
static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
   _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
   _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
};

#undef _Q
321
322 static const struct nvc0_hw_sm_query_cfg *
323 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
324 {
325 struct nvc0_screen *screen = nvc0->screen;
326 struct nvc0_query *q = &hq->base;
327
328 if (screen->base.class_3d >= NVE4_3D_CLASS)
329 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
330 return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
331 }
332
333 static void
334 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
335 {
336 struct nvc0_query *q = &hq->base;
337 q->funcs->destroy_query(nvc0, q);
338 }
339
340 static boolean
341 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
342 {
343 struct nvc0_screen *screen = nvc0->screen;
344 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
345 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
346 const struct nvc0_hw_sm_query_cfg *cfg;
347 unsigned i, c;
348 unsigned num_ab[2] = { 0, 0 };
349
350 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
351
352 /* check if we have enough free counter slots */
353 for (i = 0; i < cfg->num_counters; ++i)
354 num_ab[cfg->ctr[i].sig_dom]++;
355
356 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
357 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
358 NOUVEAU_ERR("Not enough free MP counter slots !\n");
359 return false;
360 }
361
362 assert(cfg->num_counters <= 4);
363 PUSH_SPACE(push, 4 * 8 * + 6);
364
365 if (!screen->pm.mp_counters_enabled) {
366 screen->pm.mp_counters_enabled = true;
367 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
368 PUSH_DATA (push, 0x1fcb);
369 }
370
371 /* set sequence field to 0 (used to check if result is available) */
372 for (i = 0; i < screen->mp_count; ++i)
373 hq->data[i * 10 + 10] = 0;
374 hq->sequence++;
375
376 for (i = 0; i < cfg->num_counters; ++i) {
377 const unsigned d = cfg->ctr[i].sig_dom;
378
379 if (!screen->pm.num_hw_sm_active[d]) {
380 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
381 if (screen->pm.num_hw_sm_active[!d])
382 m |= 1 << (7 + (8 * d));
383 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
384 PUSH_DATA (push, m);
385 }
386 screen->pm.num_hw_sm_active[d]++;
387
388 for (c = d * 4; c < (d * 4 + 4); ++c) {
389 if (!screen->pm.mp_counter[c]) {
390 hsq->ctr[i] = c;
391 screen->pm.mp_counter[c] = hsq;
392 break;
393 }
394 }
395 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
396
397 /* configure and reset the counter(s) */
398 if (d == 0)
399 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
400 else
401 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
402 PUSH_DATA (push, cfg->ctr[i].sig_sel);
403 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
404 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
405 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
406 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
407 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
408 PUSH_DATA (push, 0);
409 }
410 return true;
411 }
412
413 static boolean
414 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
415 {
416 struct nvc0_screen *screen = nvc0->screen;
417 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
418 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
419 const struct nvc0_hw_sm_query_cfg *cfg;
420 unsigned i, c;
421
422 if (screen->base.class_3d >= NVE4_3D_CLASS)
423 return nve4_hw_sm_begin_query(nvc0, hq);
424
425 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
426
427 /* check if we have enough free counter slots */
428 if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
429 NOUVEAU_ERR("Not enough free MP counter slots !\n");
430 return false;
431 }
432
433 assert(cfg->num_counters <= 8);
434 PUSH_SPACE(push, 4 * 8 * 6 + 2);
435
436 /* set sequence field to 0 (used to check if result is available) */
437 for (i = 0; i < screen->mp_count; ++i) {
438 const unsigned b = (0x24 / 4) * i;
439 hq->data[b + 8] = 0;
440 }
441 hq->sequence++;
442
443 for (i = 0; i < cfg->num_counters; ++i) {
444 unsigned s;
445
446 if (!screen->pm.num_hw_sm_active[0]) {
447 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
448 PUSH_DATA (push, 0x80000000);
449 }
450 screen->pm.num_hw_sm_active[0]++;
451
452 for (c = 0; c < 8; ++c) {
453 if (!screen->pm.mp_counter[c]) {
454 hsq->ctr[i] = c;
455 screen->pm.mp_counter[c] = hsq;
456 break;
457 }
458 }
459
460 /* configure and reset the counter(s) */
461 for (s = 0; s < cfg->ctr[i].num_src; s++) {
462 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
463 PUSH_DATA (push, cfg->ctr[i].sig_sel);
464 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
465 PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
466 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
467 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
468 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
469 PUSH_DATA (push, 0);
470 }
471 }
472 return true;
473 }
474
475 static void
476 nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
477 {
478 struct nvc0_screen *screen = nvc0->screen;
479 struct pipe_context *pipe = &nvc0->base.pipe;
480 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
481 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
482 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
483 uint32_t mask;
484 uint32_t input[3];
485 const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
486 const uint grid[3] = { screen->mp_count, 1, 1 };
487 unsigned c;
488
489 if (unlikely(!screen->pm.prog)) {
490 struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
491 prog->type = PIPE_SHADER_COMPUTE;
492 prog->translated = true;
493 prog->num_gprs = 14;
494 prog->parm_size = 12;
495 if (is_nve4) {
496 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
497 prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
498 } else {
499 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
500 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
501 }
502 screen->pm.prog = prog;
503 }
504
505 /* disable all counting */
506 PUSH_SPACE(push, 8);
507 for (c = 0; c < 8; ++c)
508 if (screen->pm.mp_counter[c]) {
509 if (is_nve4) {
510 IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
511 } else {
512 IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
513 }
514 }
515 /* release counters for this query */
516 for (c = 0; c < 8; ++c) {
517 if (screen->pm.mp_counter[c] == hsq) {
518 uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
519 screen->pm.num_hw_sm_active[d]--;
520 screen->pm.mp_counter[c] = NULL;
521 }
522 }
523
524 BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
525 hq->bo);
526
527 PUSH_SPACE(push, 1);
528 IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
529
530 pipe->bind_compute_state(pipe, screen->pm.prog);
531 input[0] = (hq->bo->offset + hq->base_offset);
532 input[1] = (hq->bo->offset + hq->base_offset) >> 32;
533 input[2] = hq->sequence;
534 pipe->launch_grid(pipe, block, grid, 0, input);
535
536 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
537
538 /* re-activate other counters */
539 PUSH_SPACE(push, 16);
540 mask = 0;
541 for (c = 0; c < 8; ++c) {
542 const struct nvc0_hw_sm_query_cfg *cfg;
543 unsigned i;
544
545 hsq = screen->pm.mp_counter[c];
546 if (!hsq)
547 continue;
548
549 cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
550 for (i = 0; i < cfg->num_counters; ++i) {
551 if (mask & (1 << hsq->ctr[i]))
552 break;
553 mask |= 1 << hsq->ctr[i];
554 if (is_nve4) {
555 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
556 } else {
557 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
558 }
559 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
560 }
561 }
562 }
563
564 static inline bool
565 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
566 struct nvc0_context *nvc0, bool wait,
567 struct nvc0_hw_query *hq,
568 const struct nvc0_hw_sm_query_cfg *cfg,
569 unsigned mp_count)
570 {
571 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
572 unsigned p, c;
573
574 for (p = 0; p < mp_count; ++p) {
575 const unsigned b = (0x24 / 4) * p;
576
577 for (c = 0; c < cfg->num_counters; ++c) {
578 if (hq->data[b + 8] != hq->sequence) {
579 if (!wait)
580 return false;
581 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
582 return false;
583 }
584 count[p][c] = hq->data[b + hsq->ctr[c]];
585 }
586 }
587 return true;
588 }
589
590 static inline bool
591 nve4_hw_sm_query_read_data(uint32_t count[32][8],
592 struct nvc0_context *nvc0, bool wait,
593 struct nvc0_hw_query *hq,
594 const struct nvc0_hw_sm_query_cfg *cfg,
595 unsigned mp_count)
596 {
597 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
598 unsigned p, c, d;
599
600 for (p = 0; p < mp_count; ++p) {
601 const unsigned b = (0x60 / 4) * p;
602
603 for (c = 0; c < cfg->num_counters; ++c) {
604 count[p][c] = 0;
605 for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
606 if (hq->data[b + 20 + d] != hq->sequence) {
607 if (!wait)
608 return false;
609 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
610 return false;
611 }
612 if (hsq->ctr[c] & ~0x3)
613 count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
614 else
615 count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
616 }
617 }
618 }
619 return true;
620 }
621
622 /* Metric calculations:
623 * sum(x) ... sum of x over all MPs
624 * avg(x) ... average of x over all MPs
625 *
626 * IPC : sum(inst_executed) / clock
627 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
628 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
629 * MP_EFFICIENCY : avg(active_cycles / clock)
630 *
631 * NOTE: Interpretation of IPC requires knowledge of MP count.
632 */
633 static boolean
634 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
635 boolean wait, union pipe_query_result *result)
636 {
637 uint32_t count[32][8];
638 uint64_t value = 0;
639 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
640 unsigned p, c;
641 const struct nvc0_hw_sm_query_cfg *cfg;
642 bool ret;
643
644 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
645
646 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
647 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
648 else
649 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
650 if (!ret)
651 return false;
652
653 if (cfg->op == NVC0_COUNTER_OPn_SUM) {
654 for (c = 0; c < cfg->num_counters; ++c)
655 for (p = 0; p < mp_count; ++p)
656 value += count[p][c];
657 value = (value * cfg->norm[0]) / cfg->norm[1];
658 } else
659 if (cfg->op == NVC0_COUNTER_OPn_OR) {
660 uint32_t v = 0;
661 for (c = 0; c < cfg->num_counters; ++c)
662 for (p = 0; p < mp_count; ++p)
663 v |= count[p][c];
664 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
665 } else
666 if (cfg->op == NVC0_COUNTER_OPn_AND) {
667 uint32_t v = ~0;
668 for (c = 0; c < cfg->num_counters; ++c)
669 for (p = 0; p < mp_count; ++p)
670 v &= count[p][c];
671 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
672 } else
673 if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
674 uint64_t v[2] = { 0, 0 };
675 for (p = 0; p < mp_count; ++p) {
676 v[0] += count[p][0];
677 v[1] += count[p][1];
678 }
679 if (v[0])
680 value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
681 } else
682 if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
683 for (p = 0; p < mp_count; ++p)
684 value += count[p][0];
685 if (count[0][1])
686 value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
687 else
688 value = 0;
689 } else
690 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
691 unsigned mp_used = 0;
692 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
693 if (count[p][1])
694 value += (count[p][0] * cfg->norm[0]) / count[p][1];
695 if (mp_used)
696 value /= (uint64_t)mp_used * cfg->norm[1];
697 } else
698 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
699 unsigned mp_used = 0;
700 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
701 value += count[p][0];
702 if (count[0][1] && mp_used) {
703 value *= cfg->norm[0];
704 value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
705 } else {
706 value = 0;
707 }
708 }
709
710 *(uint64_t *)result = value;
711 return true;
712 }
713
714 static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
715 .destroy_query = nvc0_hw_sm_destroy_query,
716 .begin_query = nvc0_hw_sm_begin_query,
717 .end_query = nvc0_hw_sm_end_query,
718 .get_query_result = nvc0_hw_sm_get_query_result,
719 };
720
721 struct nvc0_hw_query *
722 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
723 {
724 struct nvc0_screen *screen = nvc0->screen;
725 struct nvc0_hw_sm_query *hsq;
726 struct nvc0_hw_query *hq;
727 unsigned space;
728
729 if (nvc0->screen->base.device->drm_version < 0x01000101)
730 return NULL;
731
732 if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
733 (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
734 return NULL;
735
736 hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
737 if (!hsq)
738 return NULL;
739
740 hq = &hsq->base;
741 hq->funcs = &hw_sm_query_funcs;
742 hq->base.type = type;
743
744 if (screen->base.class_3d >= NVE4_3D_CLASS) {
745 /* for each MP:
746 * [00] = WS0.C0
747 * [04] = WS0.C1
748 * [08] = WS0.C2
749 * [0c] = WS0.C3
750 * [10] = WS1.C0
751 * [14] = WS1.C1
752 * [18] = WS1.C2
753 * [1c] = WS1.C3
754 * [20] = WS2.C0
755 * [24] = WS2.C1
756 * [28] = WS2.C2
757 * [2c] = WS2.C3
758 * [30] = WS3.C0
759 * [34] = WS3.C1
760 * [38] = WS3.C2
761 * [3c] = WS3.C3
762 * [40] = MP.C4
763 * [44] = MP.C5
764 * [48] = MP.C6
765 * [4c] = MP.C7
766 * [50] = WS0.sequence
767 * [54] = WS1.sequence
768 * [58] = WS2.sequence
769 * [5c] = WS3.sequence
770 */
771 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
772 } else {
773 /* for each MP:
774 * [00] = MP.C0
775 * [04] = MP.C1
776 * [08] = MP.C2
777 * [0c] = MP.C3
778 * [10] = MP.C4
779 * [14] = MP.C5
780 * [18] = MP.C6
781 * [1c] = MP.C7
782 * [20] = MP.sequence
783 */
784 space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
785 }
786
787 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
788 FREE(hq);
789 return NULL;
790 }
791
792 return hq;
793 }