/* nvc0: move HW queries to nvc0_query_hw.c/h files
 * [mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw.c
 */
1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 #define NVC0_HW_QUERY_STATE_READY 0
34 #define NVC0_HW_QUERY_STATE_ACTIVE 1
35 #define NVC0_HW_QUERY_STATE_ENDED 2
36 #define NVC0_HW_QUERY_STATE_FLUSHED 3
37
38 #define NVC0_HW_QUERY_ALLOC_SPACE 256
39
40 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
41
42 /* Code to read out MP counters: They are accessible via mmio, too, but let's
43 * just avoid mapping registers in userspace. We'd have to know which MPs are
44 * enabled/present, too, and that information is not presently exposed.
45 * We could add a kernel interface for it, but reading the counters like this
46 * has the advantage of being async (if get_result isn't called immediately).
47 */
/* Pre-assembled compute kernel (NVE4/Kepler ISA, 64-bit opcodes) launched by
 * nvc0_hw_sm_query_end() to copy the MP performance counters ($pm0..$pm7)
 * into the query's buffer object. The annotated source listing is kept in
 * the comment below; the hex words are the assembled machine code and must
 * not be modified independently of it. */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
127
128 /* For simplicity, we will allocate as many group slots as we allocate counter
129 * slots. This means that a single counter which wants to source from 2 groups
130 * will have to be declared as using 2 counter slots. This shouldn't really be
131 * a problem because such queries don't make much sense ... (unless someone is
132 * really creative).
133 */
/* Hardware configuration of a single SM (MP) performance counter slot. */
struct nvc0_mp_counter_cfg
{
   uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8; /* signal group */
   uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */
};
143
/* Reduction operations used to fold the per-MP counter values into one
 * query result (see nvc0_hw_sm_query_result()). */
#define NVC0_COUNTER_OPn_SUM 0
#define NVC0_COUNTER_OPn_OR 1
#define NVC0_COUNTER_OPn_AND 2
#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */
#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */

/* Static description of one SM query: up to 4 counter slots, the reduction
 * op to apply, and a num/denom normalization for the final value. */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_mp_counter_cfg ctr[4];
   uint8_t num_counters;
   uint8_t op;
   uint8_t norm[2]; /* normalization num,denom */
};
159
/* Table-building macros: _Q1A/_Q1B declare a single-counter query on signal
 * domain A resp. B; _M2A/_M2B/_M2AB declare two-counter metrics (both on A,
 * both on B, or one on each). */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }

/* NOTES:
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 * this is inaccurate !
 */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
   _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};
233
/* The table-building helpers are not needed past this point. */
#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB /* was previously missing and leaked into the rest of the file */
238
239 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* Pre-assembled compute kernel (NVC0/Fermi ISA) that copies the 8 MP
 * performance counters plus the sequence number into the query buffer.
 * The annotated source listing is in the comment below; the hex words are
 * the assembled machine code. */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 36
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x1000000090821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
289
/* _Q declares a single-counter NVC0:NVE4 query; s0..s5 are the per-source
 * signal selectors packed into the 48-bit src_sel field. */
#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }

static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
   _Q(DIVERGENT_BRANCH, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
   _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LOCAL_LD, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LOCAL_ST, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(SHARED_LD, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(SHARED_ST, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(THREADS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(WARPS_LAUNCHED, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
};

#undef _Q
328
329 static const struct nvc0_hw_sm_query_cfg *
330 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q)
331 {
332 struct nvc0_screen *screen = nvc0->screen;
333
334 if (screen->base.class_3d >= NVE4_3D_CLASS)
335 return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
336 return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
337 }
338
/* Start an SM performance query: reserve counter slots in the per-screen
 * state, clear the result sequence words, and program the selected MP_PM
 * counters through the push buffer. Returns false if not enough of the
 * 4 A-domain + 4 B-domain counter slots are free. */
static boolean
nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   struct nvc0_hw_query *hq = nvc0_hw_query(q);
   const struct nvc0_hw_sm_query_cfg *cfg;
   unsigned i, c;
   unsigned num_ab[2] = { 0, 0 };

   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);

   /* check if we have enough free counter slots */
   for (i = 0; i < cfg->num_counters; ++i)
      num_ab[cfg->ctr[i].sig_dom]++;

   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return false;
   }

   assert(cfg->num_counters <= 4);
   /* worst case: 4 counters x 8 words each (x6 sources on NVC0) + setup */
   PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);

   /* one-time global enable of MP counting via the SW (kernel) subchannel */
   if (!screen->pm.mp_counters_enabled) {
      screen->pm.mp_counters_enabled = true;
      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
      PUSH_DATA (push, 0x1fcb);
   }

   /* set sequence field to 0 (used to check if result is available) */
   /* NOTE(review): the stride of 10 dwords does not obviously match the
    * 0x60-byte (NVE4) or 0x24-byte (NVC0) per-MP layouts used by the
    * readout kernels — confirm against the read_data helpers. */
   for (i = 0; i < screen->mp_count; ++i)
      hq->data[i * 10 + 10] = 0;

   for (i = 0; i < cfg->num_counters; ++i) {
      const unsigned d = cfg->ctr[i].sig_dom;

      /* first counter in this domain: (re)enable the domain via SW method */
      if (!screen->pm.num_hw_sm_active[d]) {
         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
         if (screen->pm.num_hw_sm_active[!d])
            m |= 1 << (7 + (8 * d));
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, m);
      }
      screen->pm.num_hw_sm_active[d]++;

      /* grab the first free slot in this domain (A: 0-3, B: 4-7) */
      for (c = d * 4; c < (d * 4 + 4); ++c) {
         if (!screen->pm.mp_counter[c]) {
            hq->ctr[i] = c;
            screen->pm.mp_counter[c] = (struct pipe_query *)q;
            break;
         }
      }
      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */

      /* configure and reset the counter(s) */
      if (is_nve4) {
         if (d == 0)
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
         else
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
         PUSH_DATA (push, cfg->ctr[i].sig_sel);
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
         PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
         PUSH_DATA (push, 0);
      } else {
         unsigned s;

         /* NVC0 counters combine up to 6 sources; program each one */
         for (s = 0; s < cfg->ctr[i].num_src; s++) {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
            PUSH_DATA (push, cfg->ctr[i].sig_sel);
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
            PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
            PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
            PUSH_DATA (push, 0);
         }
      }
   }
   return true;
}
426
427 static void
428 nvc0_hw_sm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
429 {
430 struct nvc0_screen *screen = nvc0->screen;
431 struct pipe_context *pipe = &nvc0->base.pipe;
432 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
433 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
434 struct nvc0_hw_query *hq = nvc0_hw_query(q);
435 uint32_t mask;
436 uint32_t input[3];
437 const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
438 const uint grid[3] = { screen->mp_count, 1, 1 };
439 unsigned c;
440 const struct nvc0_hw_sm_query_cfg *cfg;
441
442 cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
443
444 if (unlikely(!screen->pm.prog)) {
445 struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
446 prog->type = PIPE_SHADER_COMPUTE;
447 prog->translated = true;
448 prog->num_gprs = 14;
449 prog->parm_size = 12;
450 if (is_nve4) {
451 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
452 prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
453 } else {
454 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
455 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
456 }
457 screen->pm.prog = prog;
458 }
459
460 /* disable all counting */
461 PUSH_SPACE(push, 8);
462 for (c = 0; c < 8; ++c)
463 if (screen->pm.mp_counter[c]) {
464 if (is_nve4) {
465 IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
466 } else {
467 IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
468 }
469 }
470 /* release counters for this query */
471 for (c = 0; c < 8; ++c) {
472 if (nvc0_query(screen->pm.mp_counter[c]) == q) {
473 screen->pm.num_hw_sm_active[c / 4]--;
474 screen->pm.mp_counter[c] = NULL;
475 }
476 }
477
478 BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
479 hq->bo);
480
481 PUSH_SPACE(push, 1);
482 IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
483
484 pipe->bind_compute_state(pipe, screen->pm.prog);
485 input[0] = (hq->bo->offset + hq->base_offset);
486 input[1] = (hq->bo->offset + hq->base_offset) >> 32;
487 input[2] = hq->sequence;
488 pipe->launch_grid(pipe, block, grid, 0, input);
489
490 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
491
492 /* re-activate other counters */
493 PUSH_SPACE(push, 16);
494 mask = 0;
495 for (c = 0; c < 8; ++c) {
496 unsigned i;
497 q = nvc0_query(screen->pm.mp_counter[c]);
498 if (!q)
499 continue;
500 cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
501 for (i = 0; i < cfg->num_counters; ++i) {
502 if (mask & (1 << hq->ctr[i]))
503 break;
504 mask |= 1 << hq->ctr[i];
505 if (is_nve4) {
506 BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(hq->ctr[i])), 1);
507 } else {
508 BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(hq->ctr[i])), 1);
509 }
510 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
511 }
512 }
513 }
514
515 static inline bool
516 nvc0_hw_sm_query_read_data(uint32_t count[32][4],
517 struct nvc0_context *nvc0, bool wait,
518 struct nvc0_query *q,
519 const struct nvc0_hw_sm_query_cfg *cfg,
520 unsigned mp_count)
521 {
522 struct nvc0_hw_query *hq = nvc0_hw_query(q);
523 unsigned p, c;
524
525 for (p = 0; p < mp_count; ++p) {
526 const unsigned b = (0x24 / 4) * p;
527
528 for (c = 0; c < cfg->num_counters; ++c) {
529 if (hq->data[b + 8] != hq->sequence) {
530 if (!wait)
531 return false;
532 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
533 return false;
534 }
535 count[p][c] = hq->data[b + hq->ctr[c]];
536 }
537 }
538 return true;
539 }
540
/* Gather the NVE4+ (Kepler) per-MP counter values into count[mp][counter].
 * Each MP's slice is 0x60 bytes. Domain-A counters (slot 0-3) are written
 * as 4 per-warp-scheduler partial values which are summed here; domain-B
 * counters (slot 4-7) are read once from dwords 16..19. Sequence words sit
 * at dwords 20+ — assumed one per scheduler copy; TODO confirm against the
 * readout kernel listing above. */
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][4],
                           struct nvc0_context *nvc0, bool wait,
                           struct nvc0_query *q,
                           const struct nvc0_hw_sm_query_cfg *cfg,
                           unsigned mp_count)
{
   struct nvc0_hw_query *hq = nvc0_hw_query(q);
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         /* domain B (ctr >= 4): single copy; domain A: 4 scheduler copies */
         for (d = 0; d < ((hq->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (hq->data[b + 20 + d] != hq->sequence) {
               if (!wait)
                  return false;
               if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return false;
            }
            if (hq->ctr[c] & ~0x3)
               count[p][c] = hq->data[b + 16 + (hq->ctr[c] & 3)];
            else
               count[p][c] += hq->data[b + d * 4 + hq->ctr[c]];
         }
      }
   }
   return true;
}
572
573 /* Metric calculations:
574 * sum(x) ... sum of x over all MPs
575 * avg(x) ... average of x over all MPs
576 *
577 * IPC : sum(inst_executed) / clock
578 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
579 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
580 * MP_EFFICIENCY : avg(active_cycles / clock)
581 *
582 * NOTE: Interpretation of IPC requires knowledge of MP count.
583 */
584 static boolean
585 nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
586 void *result, boolean wait)
587 {
588 uint32_t count[32][4];
589 uint64_t value = 0;
590 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
591 unsigned p, c;
592 const struct nvc0_hw_sm_query_cfg *cfg;
593 bool ret;
594
595 cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
596
597 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
598 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
599 else
600 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
601 if (!ret)
602 return false;
603
604 if (cfg->op == NVC0_COUNTER_OPn_SUM) {
605 for (c = 0; c < cfg->num_counters; ++c)
606 for (p = 0; p < mp_count; ++p)
607 value += count[p][c];
608 value = (value * cfg->norm[0]) / cfg->norm[1];
609 } else
610 if (cfg->op == NVC0_COUNTER_OPn_OR) {
611 uint32_t v = 0;
612 for (c = 0; c < cfg->num_counters; ++c)
613 for (p = 0; p < mp_count; ++p)
614 v |= count[p][c];
615 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
616 } else
617 if (cfg->op == NVC0_COUNTER_OPn_AND) {
618 uint32_t v = ~0;
619 for (c = 0; c < cfg->num_counters; ++c)
620 for (p = 0; p < mp_count; ++p)
621 v &= count[p][c];
622 value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
623 } else
624 if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
625 uint64_t v[2] = { 0, 0 };
626 for (p = 0; p < mp_count; ++p) {
627 v[0] += count[p][0];
628 v[1] += count[p][1];
629 }
630 if (v[0])
631 value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
632 } else
633 if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
634 for (p = 0; p < mp_count; ++p)
635 value += count[p][0];
636 if (count[0][1])
637 value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
638 else
639 value = 0;
640 } else
641 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
642 unsigned mp_used = 0;
643 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
644 if (count[p][1])
645 value += (count[p][0] * cfg->norm[0]) / count[p][1];
646 if (mp_used)
647 value /= (uint64_t)mp_used * cfg->norm[1];
648 } else
649 if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
650 unsigned mp_used = 0;
651 for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
652 value += count[p][0];
653 if (count[0][1] && mp_used) {
654 value *= cfg->norm[0];
655 value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
656 } else {
657 value = 0;
658 }
659 }
660
661 *(uint64_t *)result = value;
662 return true;
663 }
664
/* (Re)allocate the query's result storage from the screen's GART memory
 * pool and map it. A size of 0 only releases the current storage. If the
 * old buffer may still be written by the GPU (state != READY), its release
 * is deferred until the current fence signals. Returns false on allocation
 * or mapping failure. */
static bool
nvc0_hw_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q,
                       int size)
{
   struct nvc0_hw_query *hq = nvc0_hw_query(q);
   struct nvc0_screen *screen = nvc0->screen;
   int ret;

   if (hq->bo) {
      nouveau_bo_ref(NULL, &hq->bo);
      if (hq->mm) {
         /* defer the free while the GPU may still write the old buffer */
         if (hq->state == NVC0_HW_QUERY_STATE_READY)
            nouveau_mm_free(hq->mm);
         else
            nouveau_fence_work(screen->base.fence.current,
                               nouveau_mm_free_work, hq->mm);
      }
   }
   if (size) {
      hq->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &hq->bo,
                                   &hq->base_offset);
      if (!hq->bo)
         return false;
      hq->offset = hq->base_offset;

      ret = nouveau_bo_map(hq->bo, 0, screen->base.client);
      if (ret) {
         /* mapping failed: release what we just allocated */
         nvc0_hw_query_allocate(nvc0, q, 0);
         return false;
      }
      hq->data = (uint32_t *)((uint8_t *)hq->bo->map + hq->base_offset);
   }
   return true;
}
699
700 static void
701 nvc0_hw_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q,
702 unsigned offset, uint32_t get)
703 {
704 struct nvc0_hw_query *hq = nvc0_hw_query(q);
705
706 offset += hq->offset;
707
708 PUSH_SPACE(push, 5);
709 PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
710 BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
711 PUSH_DATAh(push, hq->bo->offset + offset);
712 PUSH_DATA (push, hq->bo->offset + offset);
713 PUSH_DATA (push, hq->sequence);
714 PUSH_DATA (push, get);
715 }
716
717 static void
718 nvc0_hw_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q)
719 {
720 struct nvc0_hw_query *hq = nvc0_hw_query(q);
721
722 hq->offset += hq->rotate;
723 hq->data += hq->rotate / sizeof(*hq->data);
724 if (hq->offset - hq->base_offset == NVC0_HW_QUERY_ALLOC_SPACE)
725 nvc0_hw_query_allocate(nvc0, q, NVC0_HW_QUERY_ALLOC_SPACE);
726 }
727
728 static inline void
729 nvc0_hw_query_update(struct nouveau_client *cli, struct nvc0_query *q)
730 {
731 struct nvc0_hw_query *hq = nvc0_hw_query(q);
732
733 if (hq->is64bit) {
734 if (nouveau_fence_signalled(hq->fence))
735 hq->state = NVC0_HW_QUERY_STATE_READY;
736 } else {
737 if (hq->data[0] == hq->sequence)
738 hq->state = NVC0_HW_QUERY_STATE_READY;
739 }
740 }
741
742 static void
743 nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
744 {
745 struct nvc0_hw_query *hq = nvc0_hw_query(q);
746 nvc0_hw_query_allocate(nvc0, q, 0);
747 nouveau_fence_ref(NULL, &hq->fence);
748 FREE(hq);
749 }
750
/* Begin a HW query: snapshot the relevant counters so that end_query can
 * compute the delta. The magic 'get' words encode unit/report selectors for
 * QUERY_GET. Returns false only if an SM performance query could not
 * reserve counter slots. */
static boolean
nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_query *hq = nvc0_hw_query(q);
   bool ret = true;

   /* For occlusion queries we have to change the storage, because a previous
    * query might set the initial render conition to false even *after* we re-
    * initialized it to true.
    */
   if (hq->rotate) {
      nvc0_hw_query_rotate(nvc0, q);

      /* XXX: can we do this with the GPU, and sync with respect to a previous
       * query ?
       */
      hq->data[0] = hq->sequence; /* initialize sequence */
      hq->data[1] = 1; /* initial render condition = true */
      hq->data[4] = hq->sequence + 1; /* for comparison COND_MODE */
      hq->data[5] = 0;
   }
   hq->sequence++;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      /* nested queries share the global sample counter; only the outermost
       * one resets and enables it */
      hq->nesting = nvc0->screen->num_occlusion_queries_active++;
      if (hq->nesting) {
         nvc0_hw_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 3);
         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_hw_query_get(push, q, 0x10, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_hw_query_get(push, q, 0x10, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_hw_query_get(push, q, 0x20, 0x05805002 | (q->index << 5));
      nvc0_hw_query_get(push, q, 0x30, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      nvc0_hw_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_hw_query_get(push, q, 0x10, 0x00005002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      /* begin snapshot of all 10 pipeline counters at +0xc0 */
      nvc0_hw_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_hw_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_hw_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_hw_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_hw_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   default:
      /* driver-specific SM performance queries */
      if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
          (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
         ret = nvc0_hw_sm_query_begin(nvc0, q);
      }
      break;
   }
   hq->state = NVC0_HW_QUERY_STATE_ACTIVE;
   return ret;
}
826
/* Emit the commands that capture a query's end-of-range counter values into
 * its result buffer, and mark the query object as ended on the CPU side.
 *
 * The last argument to nvc0_hw_query_get() is the raw 32-bit HW report
 * request (counter source/unit encoding); the values must pair up with the
 * ones issued at query-begin time so results can be obtained by subtraction.
 */
static void
nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_hw_query *hq = nvc0_hw_query(q);

   if (hq->state != NVC0_HW_QUERY_STATE_ACTIVE) {
      /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
      if (hq->rotate)
         nvc0_hw_query_rotate(nvc0, q);
      hq->sequence++;
   }
   hq->state = NVC0_HW_QUERY_STATE_ENDED;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      nvc0_hw_query_get(push, q, 0, 0x0100f002);
      /* turn sample counting back off when the last occlusion query ends */
      if (--nvc0->screen->num_occlusion_queries_active == 0) {
         PUSH_SPACE(push, 1);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_hw_query_get(push, q, 0, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_hw_query_get(push, q, 0, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_hw_query_get(push, q, 0x00, 0x05805002 | (q->index << 5));
      nvc0_hw_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* TODO: How do we sum over all streams for render condition ? */
      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
      nvc0_hw_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
      nvc0_hw_query_get(push, q, 0x20, 0x00005002);
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_hw_query_get(push, q, 0, 0x00005002);
      break;
   case PIPE_QUERY_GPU_FINISHED:
      nvc0_hw_query_get(push, q, 0, 0x1000f010);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      /* end values land at offset 0; begin values were written at 0xc0+ */
      nvc0_hw_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_hw_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_hw_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* This query is not issued on GPU because disjoint is forced to false */
      hq->state = NVC0_HW_QUERY_STATE_READY;
      break;
   case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
      /* indexed by TFB buffer instead of by vertex stream */
      nvc0_hw_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
      break;
   default:
      /* remaining valid types are the SM performance counters */
      if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
          (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
         nvc0_hw_sm_query_end(nvc0, q);
      }
      break;
   }
   /* 64-bit queries track the current fence so readers can tell when the
    * GPU has actually written the results */
   if (hq->is64bit)
      nouveau_fence_ref(nvc0->screen->base.fence.current, &hq->fence);
}
903
904 static boolean
905 nvc0_hw_get_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
906 boolean wait, union pipe_query_result *result)
907 {
908 struct nvc0_hw_query *hq = nvc0_hw_query(q);
909 uint64_t *res64 = (uint64_t*)result;
910 uint32_t *res32 = (uint32_t*)result;
911 uint8_t *res8 = (uint8_t*)result;
912 uint64_t *data64 = (uint64_t *)hq->data;
913 unsigned i;
914
915 if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
916 (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
917 return nvc0_hw_sm_query_result(nvc0, q, result, wait);
918 }
919
920 if (hq->state != NVC0_HW_QUERY_STATE_READY)
921 nvc0_hw_query_update(nvc0->screen->base.client, q);
922
923 if (hq->state != NVC0_HW_QUERY_STATE_READY) {
924 if (!wait) {
925 if (hq->state != NVC0_HW_QUERY_STATE_FLUSHED) {
926 hq->state = NVC0_HW_QUERY_STATE_FLUSHED;
927 /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
928 PUSH_KICK(nvc0->base.pushbuf);
929 }
930 return false;
931 }
932 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
933 return false;
934 NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
935 }
936 hq->state = NVC0_HW_QUERY_STATE_READY;
937
938 switch (q->type) {
939 case PIPE_QUERY_GPU_FINISHED:
940 res8[0] = true;
941 break;
942 case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
943 res64[0] = hq->data[1] - hq->data[5];
944 break;
945 case PIPE_QUERY_OCCLUSION_PREDICATE:
946 res8[0] = hq->data[1] != hq->data[5];
947 break;
948 case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
949 case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
950 res64[0] = data64[0] - data64[2];
951 break;
952 case PIPE_QUERY_SO_STATISTICS:
953 res64[0] = data64[0] - data64[4];
954 res64[1] = data64[2] - data64[6];
955 break;
956 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
957 res8[0] = data64[0] != data64[2];
958 break;
959 case PIPE_QUERY_TIMESTAMP:
960 res64[0] = data64[1];
961 break;
962 case PIPE_QUERY_TIMESTAMP_DISJOINT:
963 res64[0] = 1000000000;
964 res8[8] = false;
965 break;
966 case PIPE_QUERY_TIME_ELAPSED:
967 res64[0] = data64[1] - data64[3];
968 break;
969 case PIPE_QUERY_PIPELINE_STATISTICS:
970 for (i = 0; i < 10; ++i)
971 res64[i] = data64[i * 2] - data64[24 + i * 2];
972 break;
973 case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
974 res32[0] = hq->data[1];
975 break;
976 default:
977 assert(0); /* can't happen, we don't create queries with invalid type */
978 return false;
979 }
980
981 return true;
982 }
983
/* Function table hooking the generic nvc0_query interface up to the HW
 * query implementation in this file. */
static const struct nvc0_query_funcs hw_query_funcs = {
   .destroy_query = nvc0_hw_destroy_query,
   .begin_query = nvc0_hw_begin_query,
   .end_query = nvc0_hw_end_query,
   .get_query_result = nvc0_hw_get_query_result,
};
990
991 struct nvc0_query *
992 nvc0_hw_create_query(struct nvc0_context *nvc0, unsigned type, unsigned index)
993 {
994 struct nvc0_hw_query *hq;
995 struct nvc0_query *q;
996 unsigned space = NVC0_HW_QUERY_ALLOC_SPACE;
997
998 hq = CALLOC_STRUCT(nvc0_hw_query);
999 if (!hq)
1000 return NULL;
1001
1002 q = &hq->base;
1003 q->funcs = &hw_query_funcs;
1004 q->type = type;
1005
1006 switch (q->type) {
1007 case PIPE_QUERY_OCCLUSION_COUNTER:
1008 case PIPE_QUERY_OCCLUSION_PREDICATE:
1009 hq->rotate = 32;
1010 space = NVC0_HW_QUERY_ALLOC_SPACE;
1011 break;
1012 case PIPE_QUERY_PIPELINE_STATISTICS:
1013 hq->is64bit = true;
1014 space = 512;
1015 break;
1016 case PIPE_QUERY_SO_STATISTICS:
1017 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1018 hq->is64bit = true;
1019 space = 64;
1020 break;
1021 case PIPE_QUERY_PRIMITIVES_GENERATED:
1022 case PIPE_QUERY_PRIMITIVES_EMITTED:
1023 hq->is64bit = true;
1024 q->index = index;
1025 space = 32;
1026 break;
1027 case PIPE_QUERY_TIME_ELAPSED:
1028 case PIPE_QUERY_TIMESTAMP:
1029 case PIPE_QUERY_TIMESTAMP_DISJOINT:
1030 case PIPE_QUERY_GPU_FINISHED:
1031 space = 32;
1032 break;
1033 case NVC0_HW_QUERY_TFB_BUFFER_OFFSET:
1034 space = 16;
1035 break;
1036 default:
1037 if (nvc0->screen->base.device->drm_version >= 0x01000101) {
1038 if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) {
1039 /* for each MP:
1040 * [00] = WS0.C0
1041 * [04] = WS0.C1
1042 * [08] = WS0.C2
1043 * [0c] = WS0.C3
1044 * [10] = WS1.C0
1045 * [14] = WS1.C1
1046 * [18] = WS1.C2
1047 * [1c] = WS1.C3
1048 * [20] = WS2.C0
1049 * [24] = WS2.C1
1050 * [28] = WS2.C2
1051 * [2c] = WS2.C3
1052 * [30] = WS3.C0
1053 * [34] = WS3.C1
1054 * [38] = WS3.C2
1055 * [3c] = WS3.C3
1056 * [40] = MP.C4
1057 * [44] = MP.C5
1058 * [48] = MP.C6
1059 * [4c] = MP.C7
1060 * [50] = WS0.sequence
1061 * [54] = WS1.sequence
1062 * [58] = WS2.sequence
1063 * [5c] = WS3.sequence
1064 */
1065 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
1066 break;
1067 } else
1068 if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) {
1069 /* for each MP:
1070 * [00] = MP.C0
1071 * [04] = MP.C1
1072 * [08] = MP.C2
1073 * [0c] = MP.C3
1074 * [10] = MP.C4
1075 * [14] = MP.C5
1076 * [18] = MP.C6
1077 * [1c] = MP.C7
1078 * [20] = MP.sequence
1079 */
1080 space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
1081 break;
1082 }
1083 }
1084 debug_printf("invalid query type: %u\n", type);
1085 FREE(q);
1086 return NULL;
1087 }
1088
1089 if (!nvc0_hw_query_allocate(nvc0, q, space)) {
1090 FREE(hq);
1091 return NULL;
1092 }
1093
1094 if (hq->rotate) {
1095 /* we advance before query_begin ! */
1096 hq->offset -= hq->rotate;
1097 hq->data -= hq->rotate / sizeof(*hq->data);
1098 } else
1099 if (!hq->is64bit)
1100 hq->data[0] = 0; /* initialize sequence */
1101
1102 return q;
1103 }
1104
1105 void
1106 nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf *push,
1107 struct nvc0_query *q, unsigned result_offset)
1108 {
1109 struct nvc0_hw_query *hq = nvc0_hw_query(q);
1110
1111 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
1112
1113 PUSH_REFN(push, hq->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
1114 nouveau_pushbuf_space(push, 0, 0, 1);
1115 nouveau_pushbuf_data(push, hq->bo, hq->offset + result_offset, 4 |
1116 NVC0_IB_ENTRY_1_NO_PREFETCH);
1117 }
1118
1119 void
1120 nvc0_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nvc0_query *q)
1121 {
1122 struct nvc0_hw_query *hq = nvc0_hw_query(q);
1123 unsigned offset = hq->offset;
1124
1125 if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20;
1126
1127 PUSH_SPACE(push, 5);
1128 PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
1129 BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
1130 PUSH_DATAh(push, hq->bo->offset + offset);
1131 PUSH_DATA (push, hq->bo->offset + offset);
1132 PUSH_DATA (push, hq->sequence);
1133 PUSH_DATA (push, (1 << 12) |
1134 NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
1135 }