nvc0: move HW SM queries to nvc0_query_hw_sm.c/h files
[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_sm.c
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
34
35 /* Code to read out MP counters: They are accessible via mmio, too, but let's
36 * just avoid mapping registers in userspace. We'd have to know which MPs are
37 * enabled/present, too, and that information is not presently exposed.
38 * We could add a kernel interface for it, but reading the counters like this
39 * has the advantage of being async (if get_result isn't called immediately).
40 */
/* Pre-assembled NVE4+ compute program used to read back the MP counters.
 *
 * Going by the disassembly: thread 0 of each warp copies $pm0..$pm7 into the
 * 64-bit buffer address taken from c0[0x0]/c0[0x4], then stores the 32-bit
 * value from c0[0x8] behind the counters.  All other threads exit early.
 * Every encoded word is listed next to the disassembly line it lines up with.
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   0x2202020202020207ULL, /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20 */
   0x2c00000084021c04ULL, /* mov b32 $r8 $tidx */
   0x2c0000000c031c04ULL, /* mov b32 $r12 $physid */
   0x2c00000010001c04ULL, /* mov b32 $r0 $pm0 */
   0x2c00000014005c04ULL, /* mov b32 $r1 $pm1 */
   0x2c00000018009c04ULL, /* mov b32 $r2 $pm2 */
   0x2c0000001c00dc04ULL, /* mov b32 $r3 $pm3 */
   0x2c00000020011c04ULL, /* mov b32 $r4 $pm4 */
   0x22b0420042320207ULL, /* sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b */
   0x2c00000024015c04ULL, /* mov b32 $r5 $pm5 */
   0x2c00000028019c04ULL, /* mov b32 $r6 $pm6 */
   0x2c0000002c01dc04ULL, /* mov b32 $r7 $pm7 */
   0x190e0000fc81dc03ULL, /* set $p0 0x1 eq u32 $r8 0x0 */
   0x2800400000029de4ULL, /* mov b32 $r10 c0[0x0] */
   0x7000c01050c21c03ULL, /* ext u32 $r8 $r12 0x414 */
   0x280040001002dde4ULL, /* mov b32 $r11 c0[0x4] */
   0x204282020042e047ULL, /* sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04 */
   0x7000c00820c25c03ULL, /* ext u32 $r9 $r12 0x208 */
   0x80000000000021e7ULL, /* (not $p0) exit */
   0x190e0000fc93dc03ULL, /* set $p1 0x1 eq u32 $r9 0x0 */
   0x1000000180821c02ULL, /* mul $r8 u32 $r8 u32 96 */
   0x1000000040931c02ULL, /* mul $r12 u32 $r9 u32 16 */
   0x1000000010935c02ULL, /* mul $r13 u32 $r9 u32 4 */
   0x4800000034825c03ULL, /* add b32 $r9 $r8 $r13 */
   0x22c042c042c04287ULL, /* sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c */
   0x4800000030821c03ULL, /* add b32 $r8 $r8 $r12 */
   0x2800000028031de4ULL, /* mov b32 $r12 $r10 */
   0x4801000020a29c03ULL, /* add b32 $r10 $c $r10 $r8 */
   0x280000002c035de4ULL, /* mov b32 $r13 $r11 */
   0x0800000000b2dc42ULL, /* add b32 $r11 $r11 0x0 $c */
   0x4801000024c31c03ULL, /* add b32 $r12 $c $r12 $r9 */
   0x9400000000a01fc5ULL, /* st b128 wt g[$r10d] $r0q */
   0x200002e04202c047ULL, /* sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00 */
   0x2800400020001de4ULL, /* mov b32 $r0 c0[0x8] */
   0x0800000000d35c42ULL, /* add b32 $r13 $r13 0x0 $c */
   0x9400000100c107c5ULL, /* $p1 st b128 wt g[$r12d+0x40] $r4q */
   0x9400000140c01f85ULL, /* st b32 wt g[$r12d+0x50] $r0 */
   0x8000000000001de7ULL  /* exit */
};
120
121 /* For simplicity, we will allocate as many group slots as we allocate counter
122 * slots. This means that a single counter which wants to source from 2 groups
123 * will have to be declared as using 2 counter slots. This shouldn't really be
124 * a problem because such queries don't make much sense ... (unless someone is
125 * really creative).
126 */
/* Programming of one hardware counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   /* mask or 4-bit logic op (depending on mode) */
   uint32_t func    : 16;
   /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t mode    : 4;
   /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t num_src : 3;
   /* signal domain: 0 selects MP_PM_A (per warp-sched), 1 selects MP_PM_B */
   uint32_t sig_dom : 1;
   /* signal group */
   uint32_t sig_sel : 8;
   /* signal selection for up to 6 sources (48 bit) */
   uint64_t src_sel;
};
136
/* How the raw per-MP counter values of a query are folded into one result.
 * OPn_* work on any number of counters, OP2_* expect exactly two (ctr0/ctr1).
 */
#define NVC0_COUNTER_OPn_SUM         0
#define NVC0_COUNTER_OPn_OR          1
#define NVC0_COUNTER_OPn_AND         2
/* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_REL_SUM_MM  3
/* sum(ctr0) / (ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0  4
/* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM  5
/* avg(ctr0) / (ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0  6
144
145 struct nvc0_hw_sm_query_cfg
146 {
147 struct nvc0_hw_sm_counter_cfg ctr[4];
148 uint8_t num_counters;
149 uint8_t op;
150 uint8_t norm[2]; /* normalization num,denom */
151 };
152
/* Builders for nve4_hw_sm_queries[] entries.
 *
 * Each counter initializer is { func, mode, num_src (always 0 here), sig_dom,
 * sig_sel, src_sel }.  The macro suffix picks the signal domain(s):
 *   _Q1A / _Q1B: one counter in domain A (sig_dom 0) / B (sig_dom 1), summed
 *   _M2A / _M2B: two-counter metric, both counters in domain A / B
 *   _M2AB:       two-counter metric, first counter in A, second in B
 * nu/dn end up in norm[0]/norm[1]; o selects the NVC0_COUNTER_OP2_* mode.
 */
#define _Q1A(n, f, m, g, s, nu, dn) \
   [NVE4_HW_SM_QUERY_##n] = { \
      { \
         { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, \
         {}, {}, {} \
      }, \
      1, NVC0_COUNTER_OPn_SUM, { nu, dn } \
   }
#define _Q1B(n, f, m, g, s, nu, dn) \
   [NVE4_HW_SM_QUERY_##n] = { \
      { \
         { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, \
         {}, {}, {} \
      }, \
      1, NVC0_COUNTER_OPn_SUM, { nu, dn } \
   }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) \
   [NVE4_HW_SM_QUERY_METRIC_##n] = { \
      { \
         { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
         { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
         {}, {}, \
      }, \
      2, NVC0_COUNTER_OP2_##o, { nu, dn } \
   }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) \
   [NVE4_HW_SM_QUERY_METRIC_##n] = { \
      { \
         { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
         { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
         {}, {}, \
      }, \
      2, NVC0_COUNTER_OP2_##o, { nu, dn } \
   }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) \
   [NVE4_HW_SM_QUERY_METRIC_##n] = { \
      { \
         { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
         { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
         {}, {}, \
      }, \
      2, NVC0_COUNTER_OP2_##o, { nu, dn } \
   }
167
168 /* NOTES:
169 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
170 * inst_executed etc.: we only count a single warp scheduler
171 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
172 * this is inaccurate !
173 */
174 static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
175 {
176 _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
177 _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
178 _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
179 _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
180 _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
181 _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
182 _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
183 _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
184 _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
185 _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
186 _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
187 _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
188 _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
189 _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
190 _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
191 _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
192 _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
193 _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
194 _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
195 _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
196 _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
197 _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
198 _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
199 _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
200 _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
201 _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
202 _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
203 _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
204 _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
205 _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
206 _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
207 _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
208 _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
209 _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
210 _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
211 _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
212 _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
213 _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
214 _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
215 _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
216 _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
217 _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
218 _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
219 _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
220 _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
221 _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
222 _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
223 _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
224 _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
225 };
226
/* The table-building helpers are only meant for nve4_hw_sm_queries[]; drop
 * all five of them (including _M2AB) so none leaks into the rest of the file.
 */
#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB
231
232 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* Pre-assembled NVC0:NVE4 compute program used to read back the MP counters.
 *
 * Going by the disassembly: thread 0 copies $pm0..$pm7 to the 64-bit buffer
 * address taken from c0[0x0]/c0[0x4], offset by 36 bytes times an id
 * extracted from $physid, and stores the 32-bit value from c0[0x8] at +0x20.
 * Every encoded word is listed next to the disassembly line it lines up with.
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   0x2c00000084021c04ULL, /* mov b32 $r8 $tidx */
   0x2c0000000c025c04ULL, /* mov b32 $r9 $physid */
   0x2c00000010001c04ULL, /* mov b32 $r0 $pm0 */
   0x2c00000014005c04ULL, /* mov b32 $r1 $pm1 */
   0x2c00000018009c04ULL, /* mov b32 $r2 $pm2 */
   0x2c0000001c00dc04ULL, /* mov b32 $r3 $pm3 */
   0x2c00000020011c04ULL, /* mov b32 $r4 $pm4 */
   0x2c00000024015c04ULL, /* mov b32 $r5 $pm5 */
   0x2c00000028019c04ULL, /* mov b32 $r6 $pm6 */
   0x2c0000002c01dc04ULL, /* mov b32 $r7 $pm7 */
   0x190e0000fc81dc03ULL, /* set $p0 0x1 eq u32 $r8 0x0 */
   0x2800400000029de4ULL, /* mov b32 $r10 c0[0x0] */
   0x280040001002dde4ULL, /* mov b32 $r11 c0[0x4] */
   0x7000c01050921c03ULL, /* ext u32 $r8 $r9 0x414 */
   0x80000000000021e7ULL, /* (not $p0) exit */
   0x1000000090821c02ULL, /* mul $r8 u32 $r8 u32 36 */
   0x4801000020a29c03ULL, /* add b32 $r10 $c $r10 $r8 */
   0x0800000000b2dc42ULL, /* add b32 $r11 $r11 0x0 $c */
   0x2800400020021de4ULL, /* mov b32 $r8 c0[0x8] */
   0x9400000000a01fc5ULL, /* st b128 wt g[$r10d+0x00] $r0q */
   0x9400000040a11fc5ULL, /* st b128 wt g[$r10d+0x10] $r4q */
   0x9400000080a21f85ULL, /* st b32 wt g[$r10d+0x20] $r8 */
   0x8000000000001de7ULL  /* exit */
};
282
/* Builder for nvc0_hw_sm_queries[] entries: a single summed counter with
 * norm 1/1.  n = query name, f = func, m = op mode, g = signal group,
 * c = number of sources, s0..s5 = per-source selection bytes, packed eight
 * bits apiece into src_sel (s4/s5 must be plain literals: they get an ULL
 * suffix pasted on).
 */
#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) \
   [NVC0_HW_SM_QUERY_##n] = { \
      { \
         { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, \
           s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, \
         {}, {}, {} \
      }, \
      1, NVC0_COUNTER_OPn_SUM, { 1, 1 } \
   }
284
285 static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
286 {
287 _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
288 _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
289 _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
290 _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
291 _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
292 _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
293 _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
294 _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
295 _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
296 _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
297 _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
298 _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
299 _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
300 _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
301 _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
302 _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
303 _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
304 _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
305 _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
306 _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
307 _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
308 _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
309 _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
310 _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
311 _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
312 _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
313 _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
314 _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
315 _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
316 _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
317 _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
318 };
319
320 #undef _Q
321
322 static const struct nvc0_hw_sm_query_cfg *
323 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
324 {
325 struct nvc0_screen *screen = nvc0->screen;
326 struct nvc0_query *q = &hq->base;
327
328 if (screen->base.class_3d >= NVE4_3D_CLASS)
329 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
330 return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
331 }
332
333 static void
334 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
335 {
336 struct nvc0_query *q = &hq->base;
337 q->funcs->destroy_query(nvc0, q);
338 }
339
340 static boolean
341 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
342 {
343 struct nvc0_screen *screen = nvc0->screen;
344 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
345 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
346 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
347 const struct nvc0_hw_sm_query_cfg *cfg;
348 unsigned i, c;
349 unsigned num_ab[2] = { 0, 0 };
350
351 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
352
353 /* check if we have enough free counter slots */
354 for (i = 0; i < cfg->num_counters; ++i)
355 num_ab[cfg->ctr[i].sig_dom]++;
356
357 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
358 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
359 NOUVEAU_ERR("Not enough free MP counter slots !\n");
360 return false;
361 }
362
363 assert(cfg->num_counters <= 4);
364 PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);
365
366 if (!screen->pm.mp_counters_enabled) {
367 screen->pm.mp_counters_enabled = true;
368 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
369 PUSH_DATA (push, 0x1fcb);
370 }
371
372 /* set sequence field to 0 (used to check if result is available) */
373 for (i = 0; i < screen->mp_count; ++i)
374 hq->data[i * 10 + 10] = 0;
375 hq->sequence++;
376
377 for (i = 0; i < cfg->num_counters; ++i) {
378 const unsigned d = cfg->ctr[i].sig_dom;
379
380 if (!screen->pm.num_hw_sm_active[d]) {
381 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
382 if (screen->pm.num_hw_sm_active[!d])
383 m |= 1 << (7 + (8 * d));
384 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
385 PUSH_DATA (push, m);
386 }
387 screen->pm.num_hw_sm_active[d]++;
388
389 for (c = d * 4; c < (d * 4 + 4); ++c) {
390 if (!screen->pm.mp_counter[c]) {
391 hsq->ctr[i] = c;
392 screen->pm.mp_counter[c] = hsq;
393 break;
394 }
395 }
396 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
397
398 /* configure and reset the counter(s) */
399 if (is_nve4) {
400 if (d == 0)
401 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
402 else
403 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
404 PUSH_DATA (push, cfg->ctr[i].sig_sel);
405 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
406 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
407 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
408 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
409 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
410 PUSH_DATA (push, 0);
411 } else {
412 unsigned s;
413
414 for (s = 0; s < cfg->ctr[i].num_src; s++) {
415 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
416 PUSH_DATA (push, cfg->ctr[i].sig_sel);
417 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
418 PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
419 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
420 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
421 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
422 PUSH_DATA (push, 0);
423 }
424 }
425 }
426 return true;
427 }
428
429 static void
430 nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
431 {
432 struct nvc0_screen *screen = nvc0->screen;
433 struct pipe_context *pipe = &nvc0->base.pipe;
434 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
435 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
436 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
437 uint32_t mask;
438 uint32_t input[3];
439 const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
440 const uint grid[3] = { screen->mp_count, 1, 1 };
441 unsigned c;
442 const struct nvc0_hw_sm_query_cfg *cfg;
443
444 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
445
446 if (unlikely(!screen->pm.prog)) {
447 struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
448 prog->type = PIPE_SHADER_COMPUTE;
449 prog->translated = true;
450 prog->num_gprs = 14;
451 prog->parm_size = 12;
452 if (is_nve4) {
453 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
454 prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
455 } else {
456 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
457 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
458 }
459 screen->pm.prog = prog;
460 }
461
462 /* disable all counting */
463 PUSH_SPACE(push, 8);
464 for (c = 0; c < 8; ++c)
465 if (screen->pm.mp_counter[c]) {
466 if (is_nve4) {
467 IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
468 } else {
469 IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
470 }
471 }
472 /* release counters for this query */
473 for (c = 0; c < 8; ++c) {
474 if (screen->pm.mp_counter[c] == hsq) {
475 screen->pm.num_hw_sm_active[c / 4]--;
476 screen->pm.mp_counter[c] = NULL;
477 }
478 }
479
480 BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
481 hq->bo);
482
483 PUSH_SPACE(push, 1);
484 IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
485
486 pipe->bind_compute_state(pipe, screen->pm.prog);
487 input[0] = (hq->bo->offset + hq->base_offset);
488 input[1] = (hq->bo->offset + hq->base_offset) >> 32;
489 input[2] = hq->sequence;
490 pipe->launch_grid(pipe, block, grid, 0, input);
491
492 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
493
494 /* re-activate other counters */
495 PUSH_SPACE(push, 16);
496 mask = 0;
497 for (c = 0; c < 8; ++c) {
498 unsigned i;
499
500 hsq = screen->pm.mp_counter[c];
501 if (!hsq)
502 continue;
503
504 cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
505 for (i = 0; i < cfg->num_counters; ++i) {
506 if (mask & (1 << hsq->ctr[i]))
507 break;
508 mask |= 1 << hsq->ctr[i];
509 if (is_nve4) {
510 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hsq->ctr[i])), 1);
511 } else {
512 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hsq->ctr[i])), 1);
513 }
514 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
515 }
516 }
517 }
518
519 static inline bool
520 nvc0_hw_sm_query_read_data(uint32_t count[32][4],
521 struct nvc0_context *nvc0, bool wait,
522 struct nvc0_hw_query *hq,
523 const struct nvc0_hw_sm_query_cfg *cfg,
524 unsigned mp_count)
525 {
526 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
527 unsigned p, c;
528
529 for (p = 0; p < mp_count; ++p) {
530 const unsigned b = (0x24 / 4) * p;
531
532 for (c = 0; c < cfg->num_counters; ++c) {
533 if (hq->data[b + 8] != hq->sequence) {
534 if (!wait)
535 return false;
536 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
537 return false;
538 }
539 count[p][c] = hq->data[b + hsq->ctr[c]];
540 }
541 }
542 return true;
543 }
544
545 static inline bool
546 nve4_hw_sm_query_read_data(uint32_t count[32][4],
547 struct nvc0_context *nvc0, bool wait,
548 struct nvc0_hw_query *hq,
549 const struct nvc0_hw_sm_query_cfg *cfg,
550 unsigned mp_count)
551 {
552 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
553 unsigned p, c, d;
554
555 for (p = 0; p < mp_count; ++p) {
556 const unsigned b = (0x60 / 4) * p;
557
558 for (c = 0; c < cfg->num_counters; ++c) {
559 count[p][c] = 0;
560 for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
561 if (hq->data[b + 20 + d] != hq->sequence) {
562 if (!wait)
563 return false;
564 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
565 return false;
566 }
567 if (hsq->ctr[c] & ~0x3)
568 count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
569 else
570 count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
571 }
572 }
573 }
574 return true;
575 }
576
577 /* Metric calculations:
578 * sum(x) ... sum of x over all MPs
579 * avg(x) ... average of x over all MPs
580 *
581 * IPC : sum(inst_executed) / clock
582 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
583 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
584 * MP_EFFICIENCY : avg(active_cycles / clock)
585 *
586 * NOTE: Interpretation of IPC requires knowledge of MP count.
587 */
588 static boolean
589 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
590 boolean wait, union pipe_query_result *result)
591 {
592 uint32_t count[32][4];
593 uint64_t value = 0;
594 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
595 unsigned p, c;
596 const struct nvc0_hw_sm_query_cfg *cfg;
597 bool ret;
598
599 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
600
601 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
602 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
603 else
604 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
605 if (!ret)
606 return false;
607
608 if (cfg->op == NVC0_COUNTER_OPn_SUM) {
609 for (c = 0; c < cfg->num_counters; ++c)
610 for (p = 0; p < mp_count; ++p)
611 value += count[p][c];
612 value = (value * cfg->norm[0]) / cfg->norm[1];
613 } else
614 if (cfg->op == NVC0_COUNTER_OPn_OR) {
615 uint32_t v = 0;
616 for (c = 0; c < cfg->num_counters; ++c)
617 for (p = 0; p < mp_count; ++p)
618 v |= count[p][c];
619 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
620 } else
621 if (cfg->op == NVC0_COUNTER_OPn_AND) {
622 uint32_t v = ~0;
623 for (c = 0; c < cfg->num_counters; ++c)
624 for (p = 0; p < mp_count; ++p)
625 v &= count[p][c];
626 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
627 } else
628 if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
629 uint64_t v[2] = { 0, 0 };
630 for (p = 0; p < mp_count; ++p) {
631 v[0] += count[p][0];
632 v[1] += count[p][1];
633 }
634 if (v[0])
635 value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
636 } else
637 if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
638 for (p = 0; p < mp_count; ++p)
639 value += count[p][0];
640 if (count[0][1])
641 value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
642 else
643 value = 0;
644 } else
645 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
646 unsigned mp_used = 0;
647 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
648 if (count[p][1])
649 value += (count[p][0] * cfg->norm[0]) / count[p][1];
650 if (mp_used)
651 value /= (uint64_t)mp_used * cfg->norm[1];
652 } else
653 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
654 unsigned mp_used = 0;
655 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
656 value += count[p][0];
657 if (count[0][1] && mp_used) {
658 value *= cfg->norm[0];
659 value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
660 } else {
661 value = 0;
662 }
663 }
664
665 *(uint64_t *)result = value;
666 return true;
667 }
668
669 static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
670 .destroy_query = nvc0_hw_sm_destroy_query,
671 .begin_query = nvc0_hw_sm_begin_query,
672 .end_query = nvc0_hw_sm_end_query,
673 .get_query_result = nvc0_hw_sm_get_query_result,
674 };
675
676 struct nvc0_hw_query *
677 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
678 {
679 struct nvc0_screen *screen = nvc0->screen;
680 struct nvc0_hw_sm_query *hsq;
681 struct nvc0_hw_query *hq;
682 unsigned space;
683
684 if (nvc0->screen->base.device->drm_version < 0x01000101)
685 return NULL;
686
687 if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
688 (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST))
689 return NULL;
690
691 hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
692 if (!hsq)
693 return NULL;
694
695 hq = &hsq->base;
696 hq->funcs = &hw_sm_query_funcs;
697 hq->base.type = type;
698
699 if (screen->base.class_3d >= NVE4_3D_CLASS) {
700 /* for each MP:
701 * [00] = WS0.C0
702 * [04] = WS0.C1
703 * [08] = WS0.C2
704 * [0c] = WS0.C3
705 * [10] = WS1.C0
706 * [14] = WS1.C1
707 * [18] = WS1.C2
708 * [1c] = WS1.C3
709 * [20] = WS2.C0
710 * [24] = WS2.C1
711 * [28] = WS2.C2
712 * [2c] = WS2.C3
713 * [30] = WS3.C0
714 * [34] = WS3.C1
715 * [38] = WS3.C2
716 * [3c] = WS3.C3
717 * [40] = MP.C4
718 * [44] = MP.C5
719 * [48] = MP.C6
720 * [4c] = MP.C7
721 * [50] = WS0.sequence
722 * [54] = WS1.sequence
723 * [58] = WS2.sequence
724 * [5c] = WS3.sequence
725 */
726 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
727 } else {
728 /* for each MP:
729 * [00] = MP.C0
730 * [04] = MP.C1
731 * [08] = MP.C2
732 * [0c] = MP.C3
733 * [10] = MP.C4
734 * [14] = MP.C5
735 * [18] = MP.C6
736 * [1c] = MP.C7
737 * [20] = MP.sequence
738 */
739 space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
740 }
741
742 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
743 FREE(hq);
744 return NULL;
745 }
746
747 return hq;
748 }