/*
 * Copyright 2011 Christoph Bumiller
 * Copyright 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
33 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
35 /* NOTE: intentionally using the same names as NV */
36 static const char *nve4_hw_sm_query_names
[] =
46 "global_ld_mem_divergence_replays",
47 "global_store_transaction",
48 "global_st_mem_divergence_replays",
55 "l1_global_load_miss",
56 "__l1_global_load_transactions",
57 "__l1_global_store_transactions",
61 "l1_local_store_miss",
62 "l1_shared_load_transactions",
63 "l1_shared_store_transactions",
65 "local_load_transactions",
67 "local_store_transactions",
79 "shared_store_replay",
82 "uncached_global_load_transaction",
/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
static const uint64_t nve4_read_hw_sm_counters_code[] =
{
   /* NVE4 (Kepler) compute kernel: each launched block stores the PM counter
    * registers of its MP into the output buffer addressed by c0[0x0]/c0[0x4],
    * followed by a sequence number taken from c0[0x8] (used later to detect
    * whether the results have landed).
    * Disassembly (excerpt — some lines were lost in extraction):
    * sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r12 $physid
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};
/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really interested in self-modifying code, maybe).
 */

/* Configuration of a single hardware MP PM counter slot. */
struct nvc0_hw_sm_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_mask;     /* mask for signal selection (only for NVC0:NVE4) */
   uint32_t src_sel;      /* signal selection for up to 4 sources */
};
/* Full configuration of one SM (MP) performance query: up to 8 counter
 * slots plus a normalization fraction applied to the summed result. */
struct nvc0_hw_sm_query_cfg
{
   struct nvc0_hw_sm_counter_cfg ctr[8];
   uint8_t num_counters;
   uint8_t norm[2]; /* normalization num,denom */
};
195 #define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } }
196 #define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } }
199 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
200 * inst_executed etc.: we only count a single warp scheduler
/* MP PM counter configurations for NVE4+ (Kepler), indexed by the
 * NVE4_HW_SM_QUERY_* id. Each entry occupies a single counter slot in
 * domain A (_Q1A, per warp-scheduler signals) or domain B (_Q1B). */
static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
{
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1A(ATOM_CAS_COUNT, 0x0001, B6, BRANCH, 0x000000004, 1, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
   _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(L1_GLD_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000000, 1, 1),
   _Q1B(L1_GST_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
};
254 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
256 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
257 * because there is a context-switch problem that we need to fix.
258 * Results might be wrong sometimes, be careful!
260 static const char *nvc0_hw_sm_query_names
[] =
290 "thread_inst_executed_0",
291 "thread_inst_executed_1",
292 "thread_inst_executed_2",
293 "thread_inst_executed_3",
/* NVC0:NVE4 (Fermi) variant of the counter readout kernel: stores the 8 MP
 * PM counters of the current MP plus a sequence number (c0[0x8]) into the
 * buffer addressed by c0[0x0]/c0[0x4]. */
static const uint64_t nvc0_read_hw_sm_counters_code[] =
{
   /* Disassembly (excerpt — some lines were lost in extraction):
    * mov b32 $r9 $physid
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * mul $r8 u32 $r8 u32 48
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x10000000c0821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};
347 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
348 #define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c
350 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
351 static const struct nvc0_hw_sm_query_cfg
354 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x11, 0x000000ff, 0x00000000),
359 static const struct nvc0_hw_sm_query_cfg
362 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000010),
363 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000020),
364 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000030),
365 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000040),
366 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000050),
367 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x24, 0x000000ff, 0x00000060),
372 static const struct nvc0_hw_sm_query_cfg
375 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000030),
380 static const struct nvc0_hw_sm_query_cfg
383 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000000),
384 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x1a, 0x000000ff, 0x00000010),
389 static const struct nvc0_hw_sm_query_cfg
390 sm20_divergent_branch
=
392 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000020),
393 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x19, 0x000000ff, 0x00000030),
398 static const struct nvc0_hw_sm_query_cfg
401 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000030),
406 static const struct nvc0_hw_sm_query_cfg
409 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x63, 0x000000ff, 0x00000040),
414 static const struct nvc0_hw_sm_query_cfg
417 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000060),
422 static const struct nvc0_hw_sm_query_cfg
425 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001000),
426 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x0000ffff, 0x00001010),
431 static const struct nvc0_hw_sm_query_cfg
434 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007060),
435 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x27, 0x0000ffff, 0x00007070),
440 static const struct nvc0_hw_sm_query_cfg
443 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000020),
448 static const struct nvc0_hw_sm_query_cfg
451 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000050),
456 static const struct nvc0_hw_sm_query_cfg
457 sm20_prof_trigger_0
=
459 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000000),
464 static const struct nvc0_hw_sm_query_cfg
465 sm20_prof_trigger_1
=
467 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000010),
472 static const struct nvc0_hw_sm_query_cfg
473 sm20_prof_trigger_2
=
475 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000020),
480 static const struct nvc0_hw_sm_query_cfg
481 sm20_prof_trigger_3
=
483 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000030),
488 static const struct nvc0_hw_sm_query_cfg
489 sm20_prof_trigger_4
=
491 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000040),
496 static const struct nvc0_hw_sm_query_cfg
497 sm20_prof_trigger_5
=
499 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000050),
504 static const struct nvc0_hw_sm_query_cfg
505 sm20_prof_trigger_6
=
507 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000060),
512 static const struct nvc0_hw_sm_query_cfg
513 sm20_prof_trigger_7
=
515 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x01, 0x000000ff, 0x00000070),
520 static const struct nvc0_hw_sm_query_cfg
523 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000010),
528 static const struct nvc0_hw_sm_query_cfg
531 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x64, 0x000000ff, 0x00000040),
536 static const struct nvc0_hw_sm_query_cfg
537 sm20_threads_launched
=
539 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000010),
540 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000020),
541 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000030),
542 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000040),
543 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000050),
544 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000060),
549 static const struct nvc0_hw_sm_query_cfg
550 sm20_th_inst_executed_0
=
552 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000000),
553 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000010),
554 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000020),
555 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000030),
556 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000040),
557 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x2f, 0x000000ff, 0x00000050),
562 static const struct nvc0_hw_sm_query_cfg
563 sm20_th_inst_executed_1
=
565 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000000),
566 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000010),
567 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000020),
568 .ctr
[3] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000030),
569 .ctr
[4] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000040),
570 .ctr
[5] = _C(0xaaaa, LOGOP
, 0x30, 0x000000ff, 0x00000050),
575 static const struct nvc0_hw_sm_query_cfg
576 sm20_warps_launched
=
578 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x26, 0x000000ff, 0x00000000),
/* Query table for compute capability 2.0 (GF100/GF110), indexed by
 * NVC0_HW_SM_QUERY_* id. NULL entries are queries not supported on sm20
 * (they exist only in the sm21 table, and vice versa). */
static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES,      &sm20_active_cycles),
   _Q(ACTIVE_WARPS,       &sm20_active_warps),
   _Q(ATOM_COUNT,         &sm20_atom_count),
   _Q(BRANCH,             &sm20_branch),
   _Q(DIVERGENT_BRANCH,   &sm20_divergent_branch),
   _Q(GLD_REQUEST,        &sm20_gld_request),
   _Q(GRED_COUNT,         &sm20_gred_count),
   _Q(GST_REQUEST,        &sm20_gst_request),
   _Q(INST_EXECUTED,      &sm20_inst_executed),
   _Q(INST_ISSUED,        &sm20_inst_issued),
   _Q(INST_ISSUED1_0,     NULL),
   _Q(INST_ISSUED1_1,     NULL),
   _Q(INST_ISSUED2_0,     NULL),
   _Q(INST_ISSUED2_1,     NULL),
   _Q(LOCAL_LD,           &sm20_local_ld),
   _Q(LOCAL_ST,           &sm20_local_st),
   _Q(PROF_TRIGGER_0,     &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1,     &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2,     &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3,     &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4,     &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5,     &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6,     &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7,     &sm20_prof_trigger_7),
   _Q(SHARED_LD,          &sm20_shared_ld),
   _Q(SHARED_ST,          &sm20_shared_st),
   _Q(THREADS_LAUNCHED,   &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2, NULL),
   _Q(TH_INST_EXECUTED_3, NULL),
   _Q(WARPS_LAUNCHED,     &sm20_warps_launched),
};
619 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
620 static const struct nvc0_hw_sm_query_cfg
623 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000000),
624 .ctr
[1] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000010),
625 .ctr
[2] = _C(0xaaaa, LOGOP
, 0x2d, 0x000000ff, 0x00000020),
630 static const struct nvc0_hw_sm_query_cfg
631 sm21_inst_issued1_0
=
633 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000010),
638 static const struct nvc0_hw_sm_query_cfg
639 sm21_inst_issued1_1
=
641 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000040),
646 static const struct nvc0_hw_sm_query_cfg
647 sm21_inst_issued2_0
=
649 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000020),
654 static const struct nvc0_hw_sm_query_cfg
655 sm21_inst_issued2_1
=
657 .ctr
[0] = _C(0xaaaa, LOGOP
, 0x7e, 0x000000ff, 0x00000050),
662 static const struct nvc0_hw_sm_query_cfg
663 sm21_th_inst_executed_0
=
665 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000000),
666 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000010),
667 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000020),
668 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000030),
669 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000040),
670 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa3, 0x000000ff, 0x00000050),
675 static const struct nvc0_hw_sm_query_cfg
676 sm21_th_inst_executed_1
=
678 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000000),
679 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000010),
680 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000020),
681 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000030),
682 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000040),
683 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa5, 0x000000ff, 0x00000050),
688 static const struct nvc0_hw_sm_query_cfg
689 sm21_th_inst_executed_2
=
691 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000000),
692 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000010),
693 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000020),
694 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000030),
695 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000040),
696 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa4, 0x000000ff, 0x00000050),
701 static const struct nvc0_hw_sm_query_cfg
702 sm21_th_inst_executed_3
=
704 .ctr
[0] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000000),
705 .ctr
[1] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000010),
706 .ctr
[2] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000020),
707 .ctr
[3] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000030),
708 .ctr
[4] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000040),
709 .ctr
[5] = _C(0xaaaa, LOGOP
, 0xa6, 0x000000ff, 0x00000050),
/* Query table for compute capability 2.1 (GF108+ except GF110), indexed by
 * NVC0_HW_SM_QUERY_* id. Shares most configs with sm20; signal groups that
 * moved on sm21 get their own sm21_* configs. NULL = unsupported here. */
static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
{
   _Q(ACTIVE_CYCLES,      &sm20_active_cycles),
   _Q(ACTIVE_WARPS,       &sm20_active_warps),
   _Q(ATOM_COUNT,         &sm20_atom_count),
   _Q(BRANCH,             &sm20_branch),
   _Q(DIVERGENT_BRANCH,   &sm20_divergent_branch),
   _Q(GLD_REQUEST,        &sm20_gld_request),
   _Q(GRED_COUNT,         &sm20_gred_count),
   _Q(GST_REQUEST,        &sm20_gst_request),
   _Q(INST_EXECUTED,      &sm21_inst_executed),
   _Q(INST_ISSUED,        NULL),
   _Q(INST_ISSUED1_0,     &sm21_inst_issued1_0),
   _Q(INST_ISSUED1_1,     &sm21_inst_issued1_1),
   _Q(INST_ISSUED2_0,     &sm21_inst_issued2_0),
   _Q(INST_ISSUED2_1,     &sm21_inst_issued2_1),
   _Q(LOCAL_LD,           &sm20_local_ld),
   _Q(LOCAL_ST,           &sm20_local_st),
   _Q(PROF_TRIGGER_0,     &sm20_prof_trigger_0),
   _Q(PROF_TRIGGER_1,     &sm20_prof_trigger_1),
   _Q(PROF_TRIGGER_2,     &sm20_prof_trigger_2),
   _Q(PROF_TRIGGER_3,     &sm20_prof_trigger_3),
   _Q(PROF_TRIGGER_4,     &sm20_prof_trigger_4),
   _Q(PROF_TRIGGER_5,     &sm20_prof_trigger_5),
   _Q(PROF_TRIGGER_6,     &sm20_prof_trigger_6),
   _Q(PROF_TRIGGER_7,     &sm20_prof_trigger_7),
   _Q(SHARED_LD,          &sm20_shared_ld),
   _Q(SHARED_ST,          &sm20_shared_st),
   _Q(THREADS_LAUNCHED,   &sm20_threads_launched),
   _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0),
   _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1),
   _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2),
   _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3),
   _Q(WARPS_LAUNCHED,     &sm20_warps_launched),
};
/* Select the per-chipset query table for NVC0:NVE4 hardware.
 * GF100 (0xc0) and GF110 (0xc8) are compute capability 2.0 and use the
 * sm20 table; every other Fermi chipset uses the sm21 table. */
static inline const struct nvc0_hw_sm_query_cfg **
nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
{
   struct nouveau_device *dev = screen->base.device;

   if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
      return sm20_hw_sm_queries;
   return sm21_hw_sm_queries;
}
/* Look up the counter configuration for a query.
 * On NVE4+ the query type indexes the nve4 table directly; on NVC0:NVE4 the
 * per-chipset table is consulted. Returns NULL for query types outside the
 * SM query range. */
static const struct nvc0_hw_sm_query_cfg *
nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nvc0_query *q = &hq->base;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];

   if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) {
      const struct nvc0_hw_sm_query_cfg **queries =
         nvc0_hw_sm_get_queries(screen);
      return queries[q->type - NVC0_HW_SM_QUERY(0)];
   }
   debug_printf("invalid query type: %d\n", q->type);
   /* NOTE(review): trailing return not visible in the excerpt; NULL matches
    * the "invalid query type" path above — confirm against upstream. */
   return NULL;
}
782 nvc0_hw_sm_destroy_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
784 struct nvc0_query
*q
= &hq
->base
;
785 nvc0_hw_query_allocate(nvc0
, q
, 0);
786 nouveau_fence_ref(NULL
, &hq
->fence
);
791 nve4_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
793 struct nvc0_screen
*screen
= nvc0
->screen
;
794 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
795 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
796 const struct nvc0_hw_sm_query_cfg
*cfg
;
798 unsigned num_ab
[2] = { 0, 0 };
800 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
802 /* check if we have enough free counter slots */
803 for (i
= 0; i
< cfg
->num_counters
; ++i
)
804 num_ab
[cfg
->ctr
[i
].sig_dom
]++;
806 if (screen
->pm
.num_hw_sm_active
[0] + num_ab
[0] > 4 ||
807 screen
->pm
.num_hw_sm_active
[1] + num_ab
[1] > 4) {
808 NOUVEAU_ERR("Not enough free MP counter slots !\n");
812 assert(cfg
->num_counters
<= 4);
813 PUSH_SPACE(push
, 4 * 8 * + 6);
815 if (!screen
->pm
.mp_counters_enabled
) {
816 screen
->pm
.mp_counters_enabled
= true;
817 BEGIN_NVC0(push
, SUBC_SW(0x06ac), 1);
818 PUSH_DATA (push
, 0x1fcb);
821 /* set sequence field to 0 (used to check if result is available) */
822 for (i
= 0; i
< screen
->mp_count
; ++i
)
823 hq
->data
[i
* 10 + 10] = 0;
826 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
827 const unsigned d
= cfg
->ctr
[i
].sig_dom
;
829 if (!screen
->pm
.num_hw_sm_active
[d
]) {
830 uint32_t m
= (1 << 22) | (1 << (7 + (8 * !d
)));
831 if (screen
->pm
.num_hw_sm_active
[!d
])
832 m
|= 1 << (7 + (8 * d
));
833 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
836 screen
->pm
.num_hw_sm_active
[d
]++;
838 for (c
= d
* 4; c
< (d
* 4 + 4); ++c
) {
839 if (!screen
->pm
.mp_counter
[c
]) {
841 screen
->pm
.mp_counter
[c
] = hsq
;
845 assert(c
<= (d
* 4 + 3)); /* must succeed, already checked for space */
847 /* configure and reset the counter(s) */
849 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_A_SIGSEL(c
& 3)), 1);
851 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_B_SIGSEL(c
& 3)), 1);
852 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
853 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SRCSEL(c
)), 1);
854 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
+ 0x2108421 * (c
& 3));
855 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 1);
856 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
857 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SET(c
)), 1);
864 nvc0_hw_sm_begin_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
866 struct nvc0_screen
*screen
= nvc0
->screen
;
867 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
868 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
869 const struct nvc0_hw_sm_query_cfg
*cfg
;
872 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
873 return nve4_hw_sm_begin_query(nvc0
, hq
);
875 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
877 /* check if we have enough free counter slots */
878 if (screen
->pm
.num_hw_sm_active
[0] + cfg
->num_counters
> 8) {
879 NOUVEAU_ERR("Not enough free MP counter slots !\n");
883 assert(cfg
->num_counters
<= 8);
884 PUSH_SPACE(push
, 8 * 8 + 2);
886 /* set sequence field to 0 (used to check if result is available) */
887 for (i
= 0; i
< screen
->mp_count
; ++i
) {
888 const unsigned b
= (0x30 / 4) * i
;
893 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
894 uint32_t mask_sel
= 0x00000000;
896 if (!screen
->pm
.num_hw_sm_active
[0]) {
897 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
898 PUSH_DATA (push
, 0x80000000);
900 screen
->pm
.num_hw_sm_active
[0]++;
902 for (c
= 0; c
< 8; ++c
) {
903 if (!screen
->pm
.mp_counter
[c
]) {
905 screen
->pm
.mp_counter
[c
] = hsq
;
910 /* Oddly-enough, the signal id depends on the slot selected on Fermi but
911 * not on Kepler. Fortunately, the signal ids are just offseted by the
914 mask_sel
|= (c
<< 8);
915 mask_sel
|= (c
<< 16);
916 mask_sel
|= (c
<< 24);
917 mask_sel
&= cfg
->ctr
[i
].src_mask
;
919 /* configure and reset the counter(s) */
920 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SIGSEL(c
)), 1);
921 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
922 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SRCSEL(c
)), 1);
923 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
| mask_sel
);
924 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(c
)), 1);
925 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
926 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SET(c
)), 1);
933 nvc0_hw_sm_end_query(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
)
935 struct nvc0_screen
*screen
= nvc0
->screen
;
936 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
937 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
938 const bool is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
939 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
940 struct pipe_grid_info info
= {};
943 const uint block
[3] = { 32, is_nve4
? 4 : 1, 1 };
944 const uint grid
[3] = { screen
->mp_count
, screen
->gpc_count
, 1 };
947 if (unlikely(!screen
->pm
.prog
)) {
948 struct nvc0_program
*prog
= CALLOC_STRUCT(nvc0_program
);
949 prog
->type
= PIPE_SHADER_COMPUTE
;
950 prog
->translated
= true;
951 prog
->parm_size
= 12;
953 prog
->code
= (uint32_t *)nve4_read_hw_sm_counters_code
;
954 prog
->code_size
= sizeof(nve4_read_hw_sm_counters_code
);
957 prog
->code
= (uint32_t *)nvc0_read_hw_sm_counters_code
;
958 prog
->code_size
= sizeof(nvc0_read_hw_sm_counters_code
);
961 screen
->pm
.prog
= prog
;
964 /* disable all counting */
966 for (c
= 0; c
< 8; ++c
)
967 if (screen
->pm
.mp_counter
[c
]) {
969 IMMED_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 0);
971 IMMED_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(c
)), 0);
974 /* release counters for this query */
975 for (c
= 0; c
< 8; ++c
) {
976 if (screen
->pm
.mp_counter
[c
] == hsq
) {
977 uint8_t d
= is_nve4
? c
/ 4 : 0; /* only one domain for NVC0:NVE4 */
978 screen
->pm
.num_hw_sm_active
[d
]--;
979 screen
->pm
.mp_counter
[c
] = NULL
;
983 BCTX_REFN_bo(nvc0
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
987 IMMED_NVC0(push
, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE
), 0);
989 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
990 input
[0] = (hq
->bo
->offset
+ hq
->base_offset
);
991 input
[1] = (hq
->bo
->offset
+ hq
->base_offset
) >> 32;
992 input
[2] = hq
->sequence
;
994 for (i
= 0; i
< 3; i
++) {
995 info
.block
[i
] = block
[i
];
996 info
.grid
[i
] = grid
[i
];
1000 pipe
->launch_grid(pipe
, &info
);
1002 nouveau_bufctx_reset(nvc0
->bufctx_cp
, NVC0_BIND_CP_QUERY
);
1004 /* re-activate other counters */
1005 PUSH_SPACE(push
, 16);
1007 for (c
= 0; c
< 8; ++c
) {
1008 const struct nvc0_hw_sm_query_cfg
*cfg
;
1011 hsq
= screen
->pm
.mp_counter
[c
];
1015 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, &hsq
->base
);
1016 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1017 if (mask
& (1 << hsq
->ctr
[i
]))
1019 mask
|= 1 << hsq
->ctr
[i
];
1021 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(hsq
->ctr
[i
])), 1);
1023 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(hsq
->ctr
[i
])), 1);
1025 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1031 nvc0_hw_sm_query_read_data(uint32_t count
[32][8],
1032 struct nvc0_context
*nvc0
, bool wait
,
1033 struct nvc0_hw_query
*hq
,
1034 const struct nvc0_hw_sm_query_cfg
*cfg
,
1037 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1040 for (p
= 0; p
< mp_count
; ++p
) {
1041 const unsigned b
= (0x30 / 4) * p
;
1043 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1044 if (hq
->data
[b
+ 8] != hq
->sequence
) {
1047 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1050 count
[p
][c
] = hq
->data
[b
+ hsq
->ctr
[c
]] * (1 << c
);
1057 nve4_hw_sm_query_read_data(uint32_t count
[32][8],
1058 struct nvc0_context
*nvc0
, bool wait
,
1059 struct nvc0_hw_query
*hq
,
1060 const struct nvc0_hw_sm_query_cfg
*cfg
,
1063 struct nvc0_hw_sm_query
*hsq
= nvc0_hw_sm_query(hq
);
1066 for (p
= 0; p
< mp_count
; ++p
) {
1067 const unsigned b
= (0x60 / 4) * p
;
1069 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1071 for (d
= 0; d
< ((hsq
->ctr
[c
] & ~3) ? 1 : 4); ++d
) {
1072 if (hq
->data
[b
+ 20 + d
] != hq
->sequence
) {
1075 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1078 if (hsq
->ctr
[c
] & ~0x3)
1079 count
[p
][c
] = hq
->data
[b
+ 16 + (hsq
->ctr
[c
] & 3)];
1081 count
[p
][c
] += hq
->data
[b
+ d
* 4 + hsq
->ctr
[c
]];
1089 nvc0_hw_sm_get_query_result(struct nvc0_context
*nvc0
, struct nvc0_hw_query
*hq
,
1090 boolean wait
, union pipe_query_result
*result
)
1092 uint32_t count
[32][8];
1094 unsigned mp_count
= MIN2(nvc0
->screen
->mp_count_compute
, 32);
1096 const struct nvc0_hw_sm_query_cfg
*cfg
;
1099 cfg
= nvc0_hw_sm_query_get_cfg(nvc0
, hq
);
1101 if (nvc0
->screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1102 ret
= nve4_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
1104 ret
= nvc0_hw_sm_query_read_data(count
, nvc0
, wait
, hq
, cfg
, mp_count
);
1108 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1109 for (p
= 0; p
< mp_count
; ++p
)
1110 value
+= count
[p
][c
];
1111 value
= (value
* cfg
->norm
[0]) / cfg
->norm
[1];
1113 *(uint64_t *)result
= value
;
1117 static const struct nvc0_hw_query_funcs hw_sm_query_funcs
= {
1118 .destroy_query
= nvc0_hw_sm_destroy_query
,
1119 .begin_query
= nvc0_hw_sm_begin_query
,
1120 .end_query
= nvc0_hw_sm_end_query
,
1121 .get_query_result
= nvc0_hw_sm_get_query_result
,
1124 struct nvc0_hw_query
*
1125 nvc0_hw_sm_create_query(struct nvc0_context
*nvc0
, unsigned type
)
1127 struct nvc0_screen
*screen
= nvc0
->screen
;
1128 struct nvc0_hw_sm_query
*hsq
;
1129 struct nvc0_hw_query
*hq
;
1132 if (nvc0
->screen
->base
.drm
->version
< 0x01000101)
1135 if ((type
< NVE4_HW_SM_QUERY(0) || type
> NVE4_HW_SM_QUERY_LAST
) &&
1136 (type
< NVC0_HW_SM_QUERY(0) || type
> NVC0_HW_SM_QUERY_LAST
))
1139 hsq
= CALLOC_STRUCT(nvc0_hw_sm_query
);
1144 hq
->funcs
= &hw_sm_query_funcs
;
1145 hq
->base
.type
= type
;
1147 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1169 * [50] = WS0.sequence
1170 * [54] = WS1.sequence
1171 * [58] = WS2.sequence
1172 * [5c] = WS3.sequence
1174 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1177 * Note that padding is used to align memory access to 128 bits.
1188 * [20] = MP.sequence
1193 space
= (8 + 1 + 3) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
1196 if (!nvc0_hw_query_allocate(nvc0
, &hq
->base
, space
)) {
1205 nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg
**queries
,
1208 unsigned i
, next
= 0;
1210 for (i
= 0; i
< NVC0_HW_SM_QUERY_COUNT
; i
++) {
1214 if (i
>= id
&& queries
[id
+ next
]) {
1222 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen
*screen
, unsigned id
,
1223 struct pipe_driver_query_info
*info
)
1227 if (screen
->base
.drm
->version
>= 0x01000101) {
1228 if (screen
->compute
) {
1229 if (screen
->base
.class_3d
== NVE4_3D_CLASS
) {
1230 count
+= NVE4_HW_SM_QUERY_COUNT
;
1232 if (screen
->base
.class_3d
< NVE4_3D_CLASS
) {
1233 const struct nvc0_hw_sm_query_cfg
**queries
=
1234 nvc0_hw_sm_get_queries(screen
);
1237 for (i
= 0; i
< NVC0_HW_SM_QUERY_COUNT
; i
++) {
1249 if (screen
->compute
) {
1250 if (screen
->base
.class_3d
== NVE4_3D_CLASS
) {
1251 info
->name
= nve4_hw_sm_query_names
[id
];
1252 info
->query_type
= NVE4_HW_SM_QUERY(id
);
1253 info
->group_id
= NVC0_HW_SM_QUERY_GROUP
;
1256 if (screen
->base
.class_3d
< NVE4_3D_CLASS
) {
1257 const struct nvc0_hw_sm_query_cfg
**queries
=
1258 nvc0_hw_sm_get_queries(screen
);
1260 id
= nvc0_hw_sm_get_next_query_id(queries
, id
);
1261 info
->name
= nvc0_hw_sm_query_names
[id
];
1262 info
->query_type
= NVC0_HW_SM_QUERY(id
);
1263 info
->group_id
= NVC0_HW_SM_QUERY_GROUP
;