2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw.h"
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
33 #define NVC0_HW_QUERY_STATE_READY 0
34 #define NVC0_HW_QUERY_STATE_ACTIVE 1
35 #define NVC0_HW_QUERY_STATE_ENDED 2
36 #define NVC0_HW_QUERY_STATE_FLUSHED 3
38 #define NVC0_HW_QUERY_ALLOC_SPACE 256
40 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
42 /* Code to read out MP counters: They are accessible via mmio, too, but let's
43 * just avoid mapping registers in userspace. We'd have to know which MPs are
44 * enabled/present, too, and that information is not presently exposed.
45 * We could add a kernel interface for it, but reading the counters like this
46 * has the advantage of being async (if get_result isn't called immediately).
/* NVE4 compute shader (pre-assembled SM ISA) that copies the MP performance
 * counters of the invoking MP into the query buffer.  Launched with one block
 * per MP; lane/warp position is derived from $physid.
 *
 * Annotated disassembly of the interesting part:
 *  mov b32 $r12 $physid
 *  mov b32 $r0..$r7 $pm0..$pm7       (read the 8 hardware counters)
 *  set $p0 0x1 eq u32 $r8 0x0
 *  mov b32 $r10 c0[0x0]              (output buffer address, lo)
 *  ext u32 $r8 $r12 0x414            (MP id)
 *  mov b32 $r11 c0[0x4]              (output buffer address, hi)
 *  ext u32 $r9 $r12 0x208            (warp scheduler id)
 *  set $p1 0x1 eq u32 $r9 0x0
 *  mul $r8 u32 $r8 u32 96            (0x60 bytes per MP)
 *  mul $r12 u32 $r9 u32 16
 *  mul $r13 u32 $r9 u32 4
 *  add b32 $r9 $r8 $r13
 *  add b32 $r8 $r8 $r12
 *  add b32 $r10 $c $r10 $r8
 *  add b32 $r11 $r11 0x0 $c
 *  add b32 $r12 $c $r12 $r9
 *  st b128 wt g[$r10d] $r0q          (store counters 0..3)
 *  add b32 $r13 $r13 0x0 $c
 *  $p1 st b128 wt g[$r12d+0x40] $r4q (store counters 4..7)
 *  st b32 wt g[$r12d+0x50] $r0      (store sequence word)
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
128 /* For simplicity, we will allocate as many group slots as we allocate counter
129 * slots. This means that a single counter which wants to source from 2 groups
130 * will have to be declared as using 2 counter slots. This shouldn't really be
131 * a problem because such queries don't make much sense ... (unless someone is
/* Configuration of a single MP performance counter slot. */
struct nvc0_mp_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t num_src : 3;  /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint64_t src_sel;      /* signal selection for up to 6 sources (48 bit) */
};
144 #define NVC0_COUNTER_OPn_SUM 0
145 #define NVC0_COUNTER_OPn_OR 1
146 #define NVC0_COUNTER_OPn_AND 2
147 #define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
148 #define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */
149 #define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
150 #define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */
152 struct nvc0_hw_sm_query_cfg
154 struct nvc0_mp_counter_cfg ctr
[4];
155 uint8_t num_counters
;
157 uint8_t norm
[2]; /* normalization num,denom */
/* Initializer shorthands for the NVE4 query-config table below:
 * _Q1A/_Q1B : single counter in PM domain A (per warp-sched) / domain B.
 * _M2A/_M2B : two-counter metric, both counters in domain A / domain B.
 * _M2AB     : two-counter metric, first counter in A, second in B.
 * Arguments: f = func mask, m = counter mode, g = signal group,
 * s = source select, o = combining operator, nu/dn = normalization num/denom. */
#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
176 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
177 * inst_executed etc.: we only count a single warp scheduler
178 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
179 * this is inaccurate !
181 static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries
[] =
183 _Q1B(ACTIVE_CYCLES
, 0x0001, B6
, WARP
, 0x00000000, 1, 1),
184 _Q1B(ACTIVE_WARPS
, 0x003f, B6
, WARP
, 0x31483104, 2, 1),
185 _Q1A(ATOM_COUNT
, 0x0001, B6
, BRANCH
, 0x00000000, 1, 1),
186 _Q1A(BRANCH
, 0x0001, B6
, BRANCH
, 0x0000000c, 1, 1),
187 _Q1A(DIVERGENT_BRANCH
, 0x0001, B6
, BRANCH
, 0x00000010, 1, 1),
188 _Q1A(GLD_REQUEST
, 0x0001, B6
, LDST
, 0x00000010, 1, 1),
189 _Q1B(GLD_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000010, 1, 1),
190 _Q1B(GST_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000004, 1, 1),
191 _Q1B(GST_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000014, 1, 1),
192 _Q1A(GRED_COUNT
, 0x0001, B6
, BRANCH
, 0x00000008, 1, 1),
193 _Q1A(GST_REQUEST
, 0x0001, B6
, LDST
, 0x00000014, 1, 1),
194 _Q1A(INST_EXECUTED
, 0x0003, B6
, EXEC
, 0x00000398, 1, 1),
195 _Q1A(INST_ISSUED
, 0x0003, B6
, ISSUE
, 0x00000104, 1, 1),
196 _Q1A(INST_ISSUED1
, 0x0001, B6
, ISSUE
, 0x00000004, 1, 1),
197 _Q1A(INST_ISSUED2
, 0x0001, B6
, ISSUE
, 0x00000008, 1, 1),
198 _Q1B(L1_GLD_HIT
, 0x0001, B6
, L1
, 0x00000010, 1, 1),
199 _Q1B(L1_GLD_MISS
, 0x0001, B6
, L1
, 0x00000014, 1, 1),
200 _Q1B(L1_LOCAL_LD_HIT
, 0x0001, B6
, L1
, 0x00000000, 1, 1),
201 _Q1B(L1_LOCAL_LD_MISS
, 0x0001, B6
, L1
, 0x00000004, 1, 1),
202 _Q1B(L1_LOCAL_ST_HIT
, 0x0001, B6
, L1
, 0x00000008, 1, 1),
203 _Q1B(L1_LOCAL_ST_MISS
, 0x0001, B6
, L1
, 0x0000000c, 1, 1),
204 _Q1B(L1_SHARED_LD_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000008, 1, 1),
205 _Q1B(L1_SHARED_ST_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x0000000c, 1, 1),
206 _Q1A(LOCAL_LD
, 0x0001, B6
, LDST
, 0x00000008, 1, 1),
207 _Q1B(LOCAL_LD_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000000, 1, 1),
208 _Q1A(LOCAL_ST
, 0x0001, B6
, LDST
, 0x0000000c, 1, 1),
209 _Q1B(LOCAL_ST_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000004, 1, 1),
210 _Q1A(PROF_TRIGGER_0
, 0x0001, B6
, USER
, 0x00000000, 1, 1),
211 _Q1A(PROF_TRIGGER_1
, 0x0001, B6
, USER
, 0x00000004, 1, 1),
212 _Q1A(PROF_TRIGGER_2
, 0x0001, B6
, USER
, 0x00000008, 1, 1),
213 _Q1A(PROF_TRIGGER_3
, 0x0001, B6
, USER
, 0x0000000c, 1, 1),
214 _Q1A(PROF_TRIGGER_4
, 0x0001, B6
, USER
, 0x00000010, 1, 1),
215 _Q1A(PROF_TRIGGER_5
, 0x0001, B6
, USER
, 0x00000014, 1, 1),
216 _Q1A(PROF_TRIGGER_6
, 0x0001, B6
, USER
, 0x00000018, 1, 1),
217 _Q1A(PROF_TRIGGER_7
, 0x0001, B6
, USER
, 0x0000001c, 1, 1),
218 _Q1A(SHARED_LD
, 0x0001, B6
, LDST
, 0x00000000, 1, 1),
219 _Q1B(SHARED_LD_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000008, 1, 1),
220 _Q1A(SHARED_ST
, 0x0001, B6
, LDST
, 0x00000004, 1, 1),
221 _Q1B(SHARED_ST_REPLAY
, 0x0001, B6
, REPLAY
, 0x0000000c, 1, 1),
222 _Q1B(SM_CTA_LAUNCHED
, 0x0001, B6
, WARP
, 0x0000001c, 1, 1),
223 _Q1A(THREADS_LAUNCHED
, 0x003f, B6
, LAUNCH
, 0x398a4188, 1, 1),
224 _Q1B(UNCACHED_GLD_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000000, 1, 1),
225 _Q1A(WARPS_LAUNCHED
, 0x0001, B6
, LAUNCH
, 0x00000004, 1, 1),
226 _M2AB(IPC
, 0x3, B6
, EXEC
, 0x398, 0xffff, LOGOP
, WARP
, 0x0, DIV_SUM_M0
, 10, 1),
227 _M2AB(IPAC
, 0x3, B6
, EXEC
, 0x398, 0x1, B6
, WARP
, 0x0, AVG_DIV_MM
, 10, 1),
228 _M2A(IPEC
, 0x3, B6
, EXEC
, 0x398, 0xe, LOGOP
, EXEC
, 0x398, AVG_DIV_MM
, 10, 1),
229 _M2A(INST_REPLAY_OHEAD
, 0x3, B6
, ISSUE
, 0x104, 0x3, B6
, EXEC
, 0x398, REL_SUM_MM
, 100, 1),
230 _M2B(MP_OCCUPANCY
, 0x3f, B6
, WARP
, 0x31483104, 0x01, B6
, WARP
, 0x0, AVG_DIV_MM
, 200, 64),
231 _M2B(MP_EFFICIENCY
, 0x01, B6
, WARP
, 0x0, 0xffff, LOGOP
, WARP
, 0x0, AVG_DIV_M0
, 100, 1),
239 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
/* NVC0:NVE4 compute shader that copies the MP performance counters of the
 * invoking MP into the query buffer (one block per MP).
 *
 * Annotated disassembly of the interesting part:
 *  mov b32 $r9 $physid
 *  mov b32 $r0..$r7 $pm0..$pm7       (read the 8 hardware counters)
 *  set $p0 0x1 eq u32 $r8 0x0
 *  mov b32 $r10 c0[0x0]              (output buffer address, lo)
 *  mov b32 $r11 c0[0x4]              (output buffer address, hi)
 *  ext u32 $r8 $r9 0x414             (MP id)
 *  mul $r8 u32 $r8 u32 36            (0x24 bytes per MP)
 *  add b32 $r10 $c $r10 $r8
 *  add b32 $r11 $r11 0x0 $c
 *  mov b32 $r8 c0[0x8]               (sequence number)
 *  st b128 wt g[$r10d+0x00] $r0q     (store counters 0..3)
 *  st b128 wt g[$r10d+0x10] $r4q     (store counters 4..7)
 *  st b32 wt g[$r10d+0x20] $r8       (store sequence word)
 */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x1000000090821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
290 #define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
292 static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries
[] =
294 _Q(ACTIVE_CYCLES
, 0xaaaa, LOGOP
, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
295 _Q(ACTIVE_WARPS
, 0xaaaa, LOGOP
, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
296 _Q(ATOM_COUNT
, 0xaaaa, LOGOP
, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
297 _Q(BRANCH
, 0xaaaa, LOGOP
, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
298 _Q(DIVERGENT_BRANCH
, 0xaaaa, LOGOP
, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
299 _Q(GLD_REQUEST
, 0xaaaa, LOGOP
, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
300 _Q(GRED_COUNT
, 0xaaaa, LOGOP
, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
301 _Q(GST_REQUEST
, 0xaaaa, LOGOP
, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
302 _Q(INST_EXECUTED
, 0xaaaa, LOGOP
, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
303 _Q(INST_ISSUED1_0
, 0xaaaa, LOGOP
, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
304 _Q(INST_ISSUED1_1
, 0xaaaa, LOGOP
, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
305 _Q(INST_ISSUED2_0
, 0xaaaa, LOGOP
, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
306 _Q(INST_ISSUED2_1
, 0xaaaa, LOGOP
, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
307 _Q(LOCAL_LD
, 0xaaaa, LOGOP
, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
308 _Q(LOCAL_ST
, 0xaaaa, LOGOP
, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
309 _Q(PROF_TRIGGER_0
, 0xaaaa, LOGOP
, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
310 _Q(PROF_TRIGGER_1
, 0xaaaa, LOGOP
, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
311 _Q(PROF_TRIGGER_2
, 0xaaaa, LOGOP
, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
312 _Q(PROF_TRIGGER_3
, 0xaaaa, LOGOP
, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
313 _Q(PROF_TRIGGER_4
, 0xaaaa, LOGOP
, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
314 _Q(PROF_TRIGGER_5
, 0xaaaa, LOGOP
, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
315 _Q(PROF_TRIGGER_6
, 0xaaaa, LOGOP
, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
316 _Q(PROF_TRIGGER_7
, 0xaaaa, LOGOP
, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
317 _Q(SHARED_LD
, 0xaaaa, LOGOP
, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
318 _Q(SHARED_ST
, 0xaaaa, LOGOP
, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
319 _Q(THREADS_LAUNCHED
, 0xaaaa, LOGOP
, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
320 _Q(TH_INST_EXECUTED_0
, 0xaaaa, LOGOP
, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
321 _Q(TH_INST_EXECUTED_1
, 0xaaaa, LOGOP
, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
322 _Q(TH_INST_EXECUTED_2
, 0xaaaa, LOGOP
, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
323 _Q(TH_INST_EXECUTED_3
, 0xaaaa, LOGOP
, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
324 _Q(WARPS_LAUNCHED
, 0xaaaa, LOGOP
, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
329 static const struct nvc0_hw_sm_query_cfg
*
330 nvc0_hw_sm_query_get_cfg(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
332 struct nvc0_screen
*screen
= nvc0
->screen
;
334 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
335 return &nve4_hw_sm_queries
[q
->type
- PIPE_QUERY_DRIVER_SPECIFIC
];
336 return &nvc0_hw_sm_queries
[q
->type
- NVC0_HW_SM_QUERY(0)];
340 nvc0_hw_sm_query_begin(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
342 struct nvc0_screen
*screen
= nvc0
->screen
;
343 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
344 const bool is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
345 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
346 const struct nvc0_hw_sm_query_cfg
*cfg
;
348 unsigned num_ab
[2] = { 0, 0 };
350 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, q
);
352 /* check if we have enough free counter slots */
353 for (i
= 0; i
< cfg
->num_counters
; ++i
)
354 num_ab
[cfg
->ctr
[i
].sig_dom
]++;
356 if (screen
->pm
.num_hw_sm_active
[0] + num_ab
[0] > 4 ||
357 screen
->pm
.num_hw_sm_active
[1] + num_ab
[1] > 4) {
358 NOUVEAU_ERR("Not enough free MP counter slots !\n");
362 assert(cfg
->num_counters
<= 4);
363 PUSH_SPACE(push
, 4 * 8 * (is_nve4
? 1 : 6) + 6);
365 if (!screen
->pm
.mp_counters_enabled
) {
366 screen
->pm
.mp_counters_enabled
= true;
367 BEGIN_NVC0(push
, SUBC_SW(0x06ac), 1);
368 PUSH_DATA (push
, 0x1fcb);
371 /* set sequence field to 0 (used to check if result is available) */
372 for (i
= 0; i
< screen
->mp_count
; ++i
)
373 hq
->data
[i
* 10 + 10] = 0;
375 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
376 const unsigned d
= cfg
->ctr
[i
].sig_dom
;
378 if (!screen
->pm
.num_hw_sm_active
[d
]) {
379 uint32_t m
= (1 << 22) | (1 << (7 + (8 * !d
)));
380 if (screen
->pm
.num_hw_sm_active
[!d
])
381 m
|= 1 << (7 + (8 * d
));
382 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
385 screen
->pm
.num_hw_sm_active
[d
]++;
387 for (c
= d
* 4; c
< (d
* 4 + 4); ++c
) {
388 if (!screen
->pm
.mp_counter
[c
]) {
390 screen
->pm
.mp_counter
[c
] = (struct pipe_query
*)q
;
394 assert(c
<= (d
* 4 + 3)); /* must succeed, already checked for space */
396 /* configure and reset the counter(s) */
399 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_A_SIGSEL(c
& 3)), 1);
401 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_B_SIGSEL(c
& 3)), 1);
402 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
403 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SRCSEL(c
)), 1);
404 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
+ 0x2108421 * (c
& 3));
405 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 1);
406 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
407 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SET(c
)), 1);
412 for (s
= 0; s
< cfg
->ctr
[i
].num_src
; s
++) {
413 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SIGSEL(s
)), 1);
414 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
415 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SRCSEL(s
)), 1);
416 PUSH_DATA (push
, (cfg
->ctr
[i
].src_sel
>> (s
* 8)) & 0xff);
417 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(s
)), 1);
418 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
419 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SET(s
)), 1);
428 nvc0_hw_sm_query_end(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
430 struct nvc0_screen
*screen
= nvc0
->screen
;
431 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
432 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
433 const bool is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
434 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
437 const uint block
[3] = { 32, is_nve4
? 4 : 1, 1 };
438 const uint grid
[3] = { screen
->mp_count
, 1, 1 };
440 const struct nvc0_hw_sm_query_cfg
*cfg
;
442 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, q
);
444 if (unlikely(!screen
->pm
.prog
)) {
445 struct nvc0_program
*prog
= CALLOC_STRUCT(nvc0_program
);
446 prog
->type
= PIPE_SHADER_COMPUTE
;
447 prog
->translated
= true;
449 prog
->parm_size
= 12;
451 prog
->code
= (uint32_t *)nve4_read_hw_sm_counters_code
;
452 prog
->code_size
= sizeof(nve4_read_hw_sm_counters_code
);
454 prog
->code
= (uint32_t *)nvc0_read_hw_sm_counters_code
;
455 prog
->code_size
= sizeof(nvc0_read_hw_sm_counters_code
);
457 screen
->pm
.prog
= prog
;
460 /* disable all counting */
462 for (c
= 0; c
< 8; ++c
)
463 if (screen
->pm
.mp_counter
[c
]) {
465 IMMED_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 0);
467 IMMED_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(c
)), 0);
470 /* release counters for this query */
471 for (c
= 0; c
< 8; ++c
) {
472 if (nvc0_query(screen
->pm
.mp_counter
[c
]) == q
) {
473 screen
->pm
.num_hw_sm_active
[c
/ 4]--;
474 screen
->pm
.mp_counter
[c
] = NULL
;
478 BCTX_REFN_bo(nvc0
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
482 IMMED_NVC0(push
, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE
), 0);
484 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
485 input
[0] = (hq
->bo
->offset
+ hq
->base_offset
);
486 input
[1] = (hq
->bo
->offset
+ hq
->base_offset
) >> 32;
487 input
[2] = hq
->sequence
;
488 pipe
->launch_grid(pipe
, block
, grid
, 0, input
);
490 nouveau_bufctx_reset(nvc0
->bufctx_cp
, NVC0_BIND_CP_QUERY
);
492 /* re-activate other counters */
493 PUSH_SPACE(push
, 16);
495 for (c
= 0; c
< 8; ++c
) {
497 q
= nvc0_query(screen
->pm
.mp_counter
[c
]);
500 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, q
);
501 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
502 if (mask
& (1 << hq
->ctr
[i
]))
504 mask
|= 1 << hq
->ctr
[i
];
506 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(hq
->ctr
[i
])), 1);
508 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(hq
->ctr
[i
])), 1);
510 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
516 nvc0_hw_sm_query_read_data(uint32_t count
[32][4],
517 struct nvc0_context
*nvc0
, bool wait
,
518 struct nvc0_query
*q
,
519 const struct nvc0_hw_sm_query_cfg
*cfg
,
522 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
525 for (p
= 0; p
< mp_count
; ++p
) {
526 const unsigned b
= (0x24 / 4) * p
;
528 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
529 if (hq
->data
[b
+ 8] != hq
->sequence
) {
532 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
535 count
[p
][c
] = hq
->data
[b
+ hq
->ctr
[c
]];
542 nve4_hw_sm_query_read_data(uint32_t count
[32][4],
543 struct nvc0_context
*nvc0
, bool wait
,
544 struct nvc0_query
*q
,
545 const struct nvc0_hw_sm_query_cfg
*cfg
,
548 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
551 for (p
= 0; p
< mp_count
; ++p
) {
552 const unsigned b
= (0x60 / 4) * p
;
554 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
556 for (d
= 0; d
< ((hq
->ctr
[c
] & ~3) ? 1 : 4); ++d
) {
557 if (hq
->data
[b
+ 20 + d
] != hq
->sequence
) {
560 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
563 if (hq
->ctr
[c
] & ~0x3)
564 count
[p
][c
] = hq
->data
[b
+ 16 + (hq
->ctr
[c
] & 3)];
566 count
[p
][c
] += hq
->data
[b
+ d
* 4 + hq
->ctr
[c
]];
573 /* Metric calculations:
574 * sum(x) ... sum of x over all MPs
575 * avg(x) ... average of x over all MPs
577 * IPC : sum(inst_executed) / clock
578 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
579 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
580 * MP_EFFICIENCY : avg(active_cycles / clock)
582 * NOTE: Interpretation of IPC requires knowledge of MP count.
585 nvc0_hw_sm_query_result(struct nvc0_context
*nvc0
, struct nvc0_query
*q
,
586 void *result
, boolean wait
)
588 uint32_t count
[32][4];
590 unsigned mp_count
= MIN2(nvc0
->screen
->mp_count_compute
, 32);
592 const struct nvc0_hw_sm_query_cfg
*cfg
;
595 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, q
);
597 if (nvc0
->screen
->base
.class_3d
>= NVE4_3D_CLASS
)
598 ret
= nve4_hw_sm_query_read_data(count
, nvc0
, wait
, q
, cfg
, mp_count
);
600 ret
= nvc0_hw_sm_query_read_data(count
, nvc0
, wait
, q
, cfg
, mp_count
);
604 if (cfg
->op
== NVC0_COUNTER_OPn_SUM
) {
605 for (c
= 0; c
< cfg
->num_counters
; ++c
)
606 for (p
= 0; p
< mp_count
; ++p
)
607 value
+= count
[p
][c
];
608 value
= (value
* cfg
->norm
[0]) / cfg
->norm
[1];
610 if (cfg
->op
== NVC0_COUNTER_OPn_OR
) {
612 for (c
= 0; c
< cfg
->num_counters
; ++c
)
613 for (p
= 0; p
< mp_count
; ++p
)
615 value
= ((uint64_t)v
* cfg
->norm
[0]) / cfg
->norm
[1];
617 if (cfg
->op
== NVC0_COUNTER_OPn_AND
) {
619 for (c
= 0; c
< cfg
->num_counters
; ++c
)
620 for (p
= 0; p
< mp_count
; ++p
)
622 value
= ((uint64_t)v
* cfg
->norm
[0]) / cfg
->norm
[1];
624 if (cfg
->op
== NVC0_COUNTER_OP2_REL_SUM_MM
) {
625 uint64_t v
[2] = { 0, 0 };
626 for (p
= 0; p
< mp_count
; ++p
) {
631 value
= ((v
[0] - v
[1]) * cfg
->norm
[0]) / (v
[0] * cfg
->norm
[1]);
633 if (cfg
->op
== NVC0_COUNTER_OP2_DIV_SUM_M0
) {
634 for (p
= 0; p
< mp_count
; ++p
)
635 value
+= count
[p
][0];
637 value
= (value
* cfg
->norm
[0]) / (count
[0][1] * cfg
->norm
[1]);
641 if (cfg
->op
== NVC0_COUNTER_OP2_AVG_DIV_MM
) {
642 unsigned mp_used
= 0;
643 for (p
= 0; p
< mp_count
; ++p
, mp_used
+= !!count
[p
][0])
645 value
+= (count
[p
][0] * cfg
->norm
[0]) / count
[p
][1];
647 value
/= (uint64_t)mp_used
* cfg
->norm
[1];
649 if (cfg
->op
== NVC0_COUNTER_OP2_AVG_DIV_M0
) {
650 unsigned mp_used
= 0;
651 for (p
= 0; p
< mp_count
; ++p
, mp_used
+= !!count
[p
][0])
652 value
+= count
[p
][0];
653 if (count
[0][1] && mp_used
) {
654 value
*= cfg
->norm
[0];
655 value
/= (uint64_t)count
[0][1] * mp_used
* cfg
->norm
[1];
661 *(uint64_t *)result
= value
;
666 nvc0_hw_query_allocate(struct nvc0_context
*nvc0
, struct nvc0_query
*q
,
669 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
670 struct nvc0_screen
*screen
= nvc0
->screen
;
674 nouveau_bo_ref(NULL
, &hq
->bo
);
676 if (hq
->state
== NVC0_HW_QUERY_STATE_READY
)
677 nouveau_mm_free(hq
->mm
);
679 nouveau_fence_work(screen
->base
.fence
.current
,
680 nouveau_mm_free_work
, hq
->mm
);
684 hq
->mm
= nouveau_mm_allocate(screen
->base
.mm_GART
, size
, &hq
->bo
,
688 hq
->offset
= hq
->base_offset
;
690 ret
= nouveau_bo_map(hq
->bo
, 0, screen
->base
.client
);
692 nvc0_hw_query_allocate(nvc0
, q
, 0);
695 hq
->data
= (uint32_t *)((uint8_t *)hq
->bo
->map
+ hq
->base_offset
);
701 nvc0_hw_query_get(struct nouveau_pushbuf
*push
, struct nvc0_query
*q
,
702 unsigned offset
, uint32_t get
)
704 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
706 offset
+= hq
->offset
;
709 PUSH_REFN (push
, hq
->bo
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
);
710 BEGIN_NVC0(push
, NVC0_3D(QUERY_ADDRESS_HIGH
), 4);
711 PUSH_DATAh(push
, hq
->bo
->offset
+ offset
);
712 PUSH_DATA (push
, hq
->bo
->offset
+ offset
);
713 PUSH_DATA (push
, hq
->sequence
);
714 PUSH_DATA (push
, get
);
718 nvc0_hw_query_rotate(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
720 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
722 hq
->offset
+= hq
->rotate
;
723 hq
->data
+= hq
->rotate
/ sizeof(*hq
->data
);
724 if (hq
->offset
- hq
->base_offset
== NVC0_HW_QUERY_ALLOC_SPACE
)
725 nvc0_hw_query_allocate(nvc0
, q
, NVC0_HW_QUERY_ALLOC_SPACE
);
729 nvc0_hw_query_update(struct nouveau_client
*cli
, struct nvc0_query
*q
)
731 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
734 if (nouveau_fence_signalled(hq
->fence
))
735 hq
->state
= NVC0_HW_QUERY_STATE_READY
;
737 if (hq
->data
[0] == hq
->sequence
)
738 hq
->state
= NVC0_HW_QUERY_STATE_READY
;
743 nvc0_hw_destroy_query(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
745 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
746 nvc0_hw_query_allocate(nvc0
, q
, 0);
747 nouveau_fence_ref(NULL
, &hq
->fence
);
752 nvc0_hw_begin_query(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
754 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
755 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
758 /* For occlusion queries we have to change the storage, because a previous
759 * query might set the initial render conition to false even *after* we re-
760 * initialized it to true.
763 nvc0_hw_query_rotate(nvc0
, q
);
765 /* XXX: can we do this with the GPU, and sync with respect to a previous
768 hq
->data
[0] = hq
->sequence
; /* initialize sequence */
769 hq
->data
[1] = 1; /* initial render condition = true */
770 hq
->data
[4] = hq
->sequence
+ 1; /* for comparison COND_MODE */
776 case PIPE_QUERY_OCCLUSION_COUNTER
:
777 case PIPE_QUERY_OCCLUSION_PREDICATE
:
778 hq
->nesting
= nvc0
->screen
->num_occlusion_queries_active
++;
780 nvc0_hw_query_get(push
, q
, 0x10, 0x0100f002);
783 BEGIN_NVC0(push
, NVC0_3D(COUNTER_RESET
), 1);
784 PUSH_DATA (push
, NVC0_3D_COUNTER_RESET_SAMPLECNT
);
785 IMMED_NVC0(push
, NVC0_3D(SAMPLECNT_ENABLE
), 1);
788 case PIPE_QUERY_PRIMITIVES_GENERATED
:
789 nvc0_hw_query_get(push
, q
, 0x10, 0x09005002 | (q
->index
<< 5));
791 case PIPE_QUERY_PRIMITIVES_EMITTED
:
792 nvc0_hw_query_get(push
, q
, 0x10, 0x05805002 | (q
->index
<< 5));
794 case PIPE_QUERY_SO_STATISTICS
:
795 nvc0_hw_query_get(push
, q
, 0x20, 0x05805002 | (q
->index
<< 5));
796 nvc0_hw_query_get(push
, q
, 0x30, 0x06805002 | (q
->index
<< 5));
798 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
799 nvc0_hw_query_get(push
, q
, 0x10, 0x03005002 | (q
->index
<< 5));
801 case PIPE_QUERY_TIME_ELAPSED
:
802 nvc0_hw_query_get(push
, q
, 0x10, 0x00005002);
804 case PIPE_QUERY_PIPELINE_STATISTICS
:
805 nvc0_hw_query_get(push
, q
, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
806 nvc0_hw_query_get(push
, q
, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
807 nvc0_hw_query_get(push
, q
, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
808 nvc0_hw_query_get(push
, q
, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
809 nvc0_hw_query_get(push
, q
, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
810 nvc0_hw_query_get(push
, q
, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
811 nvc0_hw_query_get(push
, q
, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
812 nvc0_hw_query_get(push
, q
, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
813 nvc0_hw_query_get(push
, q
, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
814 nvc0_hw_query_get(push
, q
, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
817 if ((q
->type
>= NVE4_HW_SM_QUERY(0) && q
->type
<= NVE4_HW_SM_QUERY_LAST
) ||
818 (q
->type
>= NVC0_HW_SM_QUERY(0) && q
->type
<= NVC0_HW_SM_QUERY_LAST
)) {
819 ret
= nvc0_hw_sm_query_begin(nvc0
, q
);
823 hq
->state
= NVC0_HW_QUERY_STATE_ACTIVE
;
828 nvc0_hw_end_query(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
830 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
831 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
833 if (hq
->state
!= NVC0_HW_QUERY_STATE_ACTIVE
) {
834 /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
836 nvc0_hw_query_rotate(nvc0
, q
);
839 hq
->state
= NVC0_HW_QUERY_STATE_ENDED
;
842 case PIPE_QUERY_OCCLUSION_COUNTER
:
843 case PIPE_QUERY_OCCLUSION_PREDICATE
:
844 nvc0_hw_query_get(push
, q
, 0, 0x0100f002);
845 if (--nvc0
->screen
->num_occlusion_queries_active
== 0) {
847 IMMED_NVC0(push
, NVC0_3D(SAMPLECNT_ENABLE
), 0);
850 case PIPE_QUERY_PRIMITIVES_GENERATED
:
851 nvc0_hw_query_get(push
, q
, 0, 0x09005002 | (q
->index
<< 5));
853 case PIPE_QUERY_PRIMITIVES_EMITTED
:
854 nvc0_hw_query_get(push
, q
, 0, 0x05805002 | (q
->index
<< 5));
856 case PIPE_QUERY_SO_STATISTICS
:
857 nvc0_hw_query_get(push
, q
, 0x00, 0x05805002 | (q
->index
<< 5));
858 nvc0_hw_query_get(push
, q
, 0x10, 0x06805002 | (q
->index
<< 5));
860 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
861 /* TODO: How do we sum over all streams for render condition ? */
862 /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
863 nvc0_hw_query_get(push
, q
, 0x00, 0x03005002 | (q
->index
<< 5));
864 nvc0_hw_query_get(push
, q
, 0x20, 0x00005002);
866 case PIPE_QUERY_TIMESTAMP
:
867 case PIPE_QUERY_TIME_ELAPSED
:
868 nvc0_hw_query_get(push
, q
, 0, 0x00005002);
870 case PIPE_QUERY_GPU_FINISHED
:
871 nvc0_hw_query_get(push
, q
, 0, 0x1000f010);
873 case PIPE_QUERY_PIPELINE_STATISTICS
:
874 nvc0_hw_query_get(push
, q
, 0x00, 0x00801002); /* VFETCH, VERTICES */
875 nvc0_hw_query_get(push
, q
, 0x10, 0x01801002); /* VFETCH, PRIMS */
876 nvc0_hw_query_get(push
, q
, 0x20, 0x02802002); /* VP, LAUNCHES */
877 nvc0_hw_query_get(push
, q
, 0x30, 0x03806002); /* GP, LAUNCHES */
878 nvc0_hw_query_get(push
, q
, 0x40, 0x04806002); /* GP, PRIMS_OUT */
879 nvc0_hw_query_get(push
, q
, 0x50, 0x07804002); /* RAST, PRIMS_IN */
880 nvc0_hw_query_get(push
, q
, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
881 nvc0_hw_query_get(push
, q
, 0x70, 0x0980a002); /* ROP, PIXELS */
882 nvc0_hw_query_get(push
, q
, 0x80, 0x0d808002); /* TCP, LAUNCHES */
883 nvc0_hw_query_get(push
, q
, 0x90, 0x0e809002); /* TEP, LAUNCHES */
885 case PIPE_QUERY_TIMESTAMP_DISJOINT
:
886 /* This query is not issued on GPU because disjoint is forced to false */
887 hq
->state
= NVC0_HW_QUERY_STATE_READY
;
889 case NVC0_HW_QUERY_TFB_BUFFER_OFFSET
:
890 /* indexed by TFB buffer instead of by vertex stream */
891 nvc0_hw_query_get(push
, q
, 0x00, 0x0d005002 | (q
->index
<< 5));
894 if ((q
->type
>= NVE4_HW_SM_QUERY(0) && q
->type
<= NVE4_HW_SM_QUERY_LAST
) ||
895 (q
->type
>= NVC0_HW_SM_QUERY(0) && q
->type
<= NVC0_HW_SM_QUERY_LAST
)) {
896 nvc0_hw_sm_query_end(nvc0
, q
);
901 nouveau_fence_ref(nvc0
->screen
->base
.fence
.current
, &hq
->fence
);
905 nvc0_hw_get_query_result(struct nvc0_context
*nvc0
, struct nvc0_query
*q
,
906 boolean wait
, union pipe_query_result
*result
)
908 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
909 uint64_t *res64
= (uint64_t*)result
;
910 uint32_t *res32
= (uint32_t*)result
;
911 uint8_t *res8
= (uint8_t*)result
;
912 uint64_t *data64
= (uint64_t *)hq
->data
;
915 if ((q
->type
>= NVE4_HW_SM_QUERY(0) && q
->type
<= NVE4_HW_SM_QUERY_LAST
) ||
916 (q
->type
>= NVC0_HW_SM_QUERY(0) && q
->type
<= NVC0_HW_SM_QUERY_LAST
)) {
917 return nvc0_hw_sm_query_result(nvc0
, q
, result
, wait
);
920 if (hq
->state
!= NVC0_HW_QUERY_STATE_READY
)
921 nvc0_hw_query_update(nvc0
->screen
->base
.client
, q
);
923 if (hq
->state
!= NVC0_HW_QUERY_STATE_READY
) {
925 if (hq
->state
!= NVC0_HW_QUERY_STATE_FLUSHED
) {
926 hq
->state
= NVC0_HW_QUERY_STATE_FLUSHED
;
927 /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
928 PUSH_KICK(nvc0
->base
.pushbuf
);
932 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->screen
->base
.client
))
934 NOUVEAU_DRV_STAT(&nvc0
->screen
->base
, query_sync_count
, 1);
936 hq
->state
= NVC0_HW_QUERY_STATE_READY
;
939 case PIPE_QUERY_GPU_FINISHED
:
942 case PIPE_QUERY_OCCLUSION_COUNTER
: /* u32 sequence, u32 count, u64 time */
943 res64
[0] = hq
->data
[1] - hq
->data
[5];
945 case PIPE_QUERY_OCCLUSION_PREDICATE
:
946 res8
[0] = hq
->data
[1] != hq
->data
[5];
948 case PIPE_QUERY_PRIMITIVES_GENERATED
: /* u64 count, u64 time */
949 case PIPE_QUERY_PRIMITIVES_EMITTED
: /* u64 count, u64 time */
950 res64
[0] = data64
[0] - data64
[2];
952 case PIPE_QUERY_SO_STATISTICS
:
953 res64
[0] = data64
[0] - data64
[4];
954 res64
[1] = data64
[2] - data64
[6];
956 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
957 res8
[0] = data64
[0] != data64
[2];
959 case PIPE_QUERY_TIMESTAMP
:
960 res64
[0] = data64
[1];
962 case PIPE_QUERY_TIMESTAMP_DISJOINT
:
963 res64
[0] = 1000000000;
966 case PIPE_QUERY_TIME_ELAPSED
:
967 res64
[0] = data64
[1] - data64
[3];
969 case PIPE_QUERY_PIPELINE_STATISTICS
:
970 for (i
= 0; i
< 10; ++i
)
971 res64
[i
] = data64
[i
* 2] - data64
[24 + i
* 2];
973 case NVC0_HW_QUERY_TFB_BUFFER_OFFSET
:
974 res32
[0] = hq
->data
[1];
977 assert(0); /* can't happen, we don't create queries with invalid type */
984 static const struct nvc0_query_funcs hw_query_funcs
= {
985 .destroy_query
= nvc0_hw_destroy_query
,
986 .begin_query
= nvc0_hw_begin_query
,
987 .end_query
= nvc0_hw_end_query
,
988 .get_query_result
= nvc0_hw_get_query_result
,
992 nvc0_hw_create_query(struct nvc0_context
*nvc0
, unsigned type
, unsigned index
)
994 struct nvc0_hw_query
*hq
;
995 struct nvc0_query
*q
;
996 unsigned space
= NVC0_HW_QUERY_ALLOC_SPACE
;
998 hq
= CALLOC_STRUCT(nvc0_hw_query
);
1003 q
->funcs
= &hw_query_funcs
;
1007 case PIPE_QUERY_OCCLUSION_COUNTER
:
1008 case PIPE_QUERY_OCCLUSION_PREDICATE
:
1010 space
= NVC0_HW_QUERY_ALLOC_SPACE
;
1012 case PIPE_QUERY_PIPELINE_STATISTICS
:
1016 case PIPE_QUERY_SO_STATISTICS
:
1017 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
1021 case PIPE_QUERY_PRIMITIVES_GENERATED
:
1022 case PIPE_QUERY_PRIMITIVES_EMITTED
:
1027 case PIPE_QUERY_TIME_ELAPSED
:
1028 case PIPE_QUERY_TIMESTAMP
:
1029 case PIPE_QUERY_TIMESTAMP_DISJOINT
:
1030 case PIPE_QUERY_GPU_FINISHED
:
1033 case NVC0_HW_QUERY_TFB_BUFFER_OFFSET
:
1037 if (nvc0
->screen
->base
.device
->drm_version
>= 0x01000101) {
1038 if (type
>= NVE4_HW_SM_QUERY(0) && type
<= NVE4_HW_SM_QUERY_LAST
) {
1060 * [50] = WS0.sequence
1061 * [54] = WS1.sequence
1062 * [58] = WS2.sequence
1063 * [5c] = WS3.sequence
1065 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1068 if (type
>= NVC0_HW_SM_QUERY(0) && type
<= NVC0_HW_SM_QUERY_LAST
) {
1078 * [20] = MP.sequence
1080 space
= (8 + 1) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1084 debug_printf("invalid query type: %u\n", type
);
1089 if (!nvc0_hw_query_allocate(nvc0
, q
, space
)) {
1095 /* we advance before query_begin ! */
1096 hq
->offset
-= hq
->rotate
;
1097 hq
->data
-= hq
->rotate
/ sizeof(*hq
->data
);
1100 hq
->data
[0] = 0; /* initialize sequence */
1106 nvc0_hw_query_pushbuf_submit(struct nouveau_pushbuf
*push
,
1107 struct nvc0_query
*q
, unsigned result_offset
)
1109 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
1111 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
1113 PUSH_REFN(push
, hq
->bo
, NOUVEAU_BO_RD
| NOUVEAU_BO_GART
);
1114 nouveau_pushbuf_space(push
, 0, 0, 1);
1115 nouveau_pushbuf_data(push
, hq
->bo
, hq
->offset
+ result_offset
, 4 |
1116 NVC0_IB_ENTRY_1_NO_PREFETCH
);
1120 nvc0_hw_query_fifo_wait(struct nouveau_pushbuf
*push
, struct nvc0_query
*q
)
1122 struct nvc0_hw_query
*hq
= nvc0_hw_query(q
);
1123 unsigned offset
= hq
->offset
;
1125 if (q
->type
== PIPE_QUERY_SO_OVERFLOW_PREDICATE
) offset
+= 0x20;
1127 PUSH_SPACE(push
, 5);
1128 PUSH_REFN (push
, hq
->bo
, NOUVEAU_BO_GART
| NOUVEAU_BO_RD
);
1129 BEGIN_NVC0(push
, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH
), 4);
1130 PUSH_DATAh(push
, hq
->bo
->offset
+ offset
);
1131 PUSH_DATA (push
, hq
->bo
->offset
+ offset
);
1132 PUSH_DATA (push
, hq
->sequence
);
1133 PUSH_DATA (push
, (1 << 12) |
1134 NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL
);