2 * Copyright 2015 Samuel Pitoiset
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #define NV50_PUSH_EXPLICIT_SPACE_CHECKING
25 #include "nv50/nv50_context.h"
26 #include "nv50/nv50_query_hw_sm.h"
28 #include "nv_object.xml.h"
29 #include "nv50/nv50_compute.xml.h"
31 /* === PERFORMANCE MONITORING COUNTERS for NV84+ === */
33 /* NOTE: intentionally using the same names as NV */
/* Table of counter name strings. nv50_hw_sm_get_driver_query_info() indexes
 * it with the query id to fill in info->name.
 * NOTE(review): the initializer entries are not visible in this excerpt;
 * presumably there is one string per NV50_HW_SM_QUERY_* value, in the same
 * order as the enum - confirm. */
34 static const char *nv50_hw_sm_query_names
[] =
/* Pre-assembled compute program, stored as 64-bit instruction words.
 * nv50_hw_sm_end_query() installs this array as the code of the compute
 * program it launches once the counters have been stopped. The comment inside
 * the initializer is the author's (partial) disassembly listing.
 * NOTE(review): the listing scales an index by 0x14 and advances the store
 * address in steps of 0x04, which looks like the same five-words-per-MP
 * layout that nv50_hw_sm_query_read_data() walks - confirm against the full
 * listing. */
51 static const uint64_t nv50_read_hw_sm_counters_code
[] =
53 /* and b32 $r0 $r0 0x0000ffff
54 * add b32 $c0 $r0 $r0 $r0
63 * and b32 $r4 $r4 0x000f0000
64 * shr u32 $r4 $r4 0x10
65 * mul $r4 u24 $r4 0x14
68 * add b32 $r5 $r5 0x04
70 * add b32 $r5 $r5 0x04
72 * add b32 $r5 $r5 0x04
74 * add b32 $r5 $r5 0x04
75 * exit st b32 g15[$r5] $r6 */
76 0x00000fffd03f0001ULL
,
77 0x040007c020000001ULL
,
78 0x0000028030000003ULL
,
79 0x6001078000000001ULL
,
80 0x6001478000000005ULL
,
81 0x6001878000000009ULL
,
82 0x6001c7800000000dULL
,
83 0x6000078000000011ULL
,
84 0x4400c78010000815ULL
,
85 0x4400c78010000a19ULL
,
86 0x0000f003d0000811ULL
,
87 0xe410078030100811ULL
,
88 0x0000000340540811ULL
,
89 0x0401078020000a15ULL
,
90 0xa0c00780d00f0a01ULL
,
91 0x0000000320048a15ULL
,
92 0xa0c00780d00f0a05ULL
,
93 0x0000000320048a15ULL
,
94 0xa0c00780d00f0a09ULL
,
95 0x0000000320048a15ULL
,
96 0xa0c00780d00f0a0dULL
,
97 0x0000000320048a15ULL
,
98 0xa0c00781d00f0a19ULL
,
/* Settings for a single MP performance counter, packed into bit-fields.
 * nv50_hw_sm_begin_query() combines them into the MP_PM_CONTROL word. */
struct nv50_hw_sm_counter_cfg
{
   uint32_t mode : 4; /* counting mode: LOGOP or LOGOP_PULSE */
   uint32_t unit : 8; /* source unit: UNK0 .. UNK5 */
   uint32_t sig  : 8; /* signal selected within the unit */
};
108 struct nv50_hw_sm_query_cfg
110 struct nv50_hw_sm_counter_cfg ctr
[4];
111 uint8_t num_counters
;
/* Builds one designated entry of a query-config table: query n uses a single
 * counter with control mode m, unit u and signal s; the three remaining
 * counter configurations are left empty and num_counters is 1. */
#define _Q(n, m, u, s)                                \
   [NV50_HW_SM_QUERY_##n] = {                         \
      {                                               \
         {                                            \
            NV50_COMPUTE_MP_PM_CONTROL_MODE_##m,      \
            NV50_COMPUTE_MP_PM_CONTROL_UNIT_##u,      \
            s,                                        \
         },                                           \
         {}, {}, {}                                   \
      },                                              \
      1                                               \
   }
116 /* ==== Compute capability 1.1 (G84+) ==== */
117 static const struct nv50_hw_sm_query_cfg sm11_hw_sm_queries
[] =
119 _Q(BRANCH
, LOGOP
, UNK4
, 0x02),
120 _Q(DIVERGENT_BRANCH
, LOGOP
, UNK4
, 0x09),
121 _Q(INSTRUCTIONS
, LOGOP
, UNK4
, 0x04),
122 _Q(PROF_TRIGGER_0
, LOGOP
, UNK1
, 0x26),
123 _Q(PROF_TRIGGER_1
, LOGOP
, UNK1
, 0x27),
124 _Q(PROF_TRIGGER_2
, LOGOP
, UNK1
, 0x28),
125 _Q(PROF_TRIGGER_3
, LOGOP
, UNK1
, 0x29),
126 _Q(PROF_TRIGGER_4
, LOGOP
, UNK1
, 0x2a),
127 _Q(PROF_TRIGGER_5
, LOGOP
, UNK1
, 0x2b),
128 _Q(PROF_TRIGGER_6
, LOGOP
, UNK1
, 0x2c),
129 _Q(PROF_TRIGGER_7
, LOGOP
, UNK1
, 0x2d),
130 _Q(SM_CTA_LAUNCHED
, LOGOP
, UNK1
, 0x33),
131 _Q(WARP_SERIALIZE
, LOGOP
, UNK0
, 0x0b),
/* Returns the 16-bit function word that nv50_hw_sm_begin_query() and
 * nv50_hw_sm_end_query() place in bits 8..23 of MP_PM_CONTROL for counter
 * slot `slot`.
 *
 * Each constant is a 16-entry truth table whose bit k is set exactly when
 * bit `slot` of k is set (presumably this makes the logic op pass input
 * `slot` through unchanged - confirm against the hardware documentation).
 *
 * slot: counter slot index, 0..3.
 * Returns the function word, or 0 for any slot outside 0..3.
 */
static inline uint16_t nv50_hw_sm_get_func(uint8_t slot)
{
   switch (slot) {
   case 0: return 0xaaaa;
   case 1: return 0xcccc;
   case 2: return 0xf0f0;
   case 3: return 0xff00;
   default:
      /* only four counter slots exist; make the fallback explicit */
      return 0;
   }
}
145 static const struct nv50_hw_sm_query_cfg
*
146 nv50_hw_sm_query_get_cfg(struct nv50_context
*nv50
, struct nv50_hw_query
*hq
)
148 struct nv50_query
*q
= &hq
->base
;
149 return &sm11_hw_sm_queries
[q
->type
- NV50_HW_SM_QUERY(0)];
/* Destroy callback for SM counter queries (installed as .destroy_query in
 * hw_sm_query_funcs). Calls nv50_hw_query_allocate() with a size of 0 and
 * drops the query's fence reference.
 * NOTE(review): passing size 0 presumably releases the query's buffer
 * storage - confirm in nv50_hw_query_allocate(). */
153 nv50_hw_sm_destroy_query(struct nv50_context
*nv50
, struct nv50_hw_query
*hq
)
155 struct nv50_query
*q
= &hq
->base
;
156 nv50_hw_query_allocate(nv50
, q
, 0);
157 nouveau_fence_ref(NULL
, &hq
->fence
);
/* Begin callback for SM counter queries (installed as .begin_query in
 * hw_sm_query_funcs). Looks up the counter configuration for the query type,
 * claims a free MP counter slot for every counter the configuration needs
 * and programs MP_PM_CONTROL for that slot. An error is logged when fewer
 * free slots remain than the configuration requires (four slots exist). */
162 nv50_hw_sm_begin_query(struct nv50_context
*nv50
, struct nv50_hw_query
*hq
)
164 struct nv50_screen
*screen
= nv50
->screen
;
165 struct nouveau_pushbuf
*push
= nv50
->base
.pushbuf
;
166 struct nv50_hw_sm_query
*hsq
= nv50_hw_sm_query(hq
);
167 const struct nv50_hw_sm_query_cfg
*cfg
;
171 cfg
= nv50_hw_sm_query_get_cfg(nv50
, hq
);
173 /* check if we have enough free counter slots */
174 if (screen
->pm
.num_hw_sm_active
+ cfg
->num_counters
> 4) {
175 NOUVEAU_ERR("Not enough free MP counter slots !\n");
179 assert(cfg
->num_counters
<= 4);
180 PUSH_SPACE(push
, 4 * 4);
182 /* set sequence field to 0 (used to check if result is available) */
/* NOTE(review): this clears word b + 16, but nv50_hw_sm_query_read_data()
 * tests the sequence at word b + 4 and each MP only owns 0x14 / 4 = 5 words.
 * 16 looks like the byte offset 0x10 used as a word index; b + 4 is
 * presumably what was intended - confirm and fix. */
183 for (i
= 0; i
< screen
->MPsInTP
; ++i
) {
184 const unsigned b
= (0x14 / 4) * i
;
185 hq
->data
[b
+ 16] = 0;
189 for (i
= 0; i
< cfg
->num_counters
; i
++) {
190 screen
->pm
.num_hw_sm_active
++;
192 /* find free counter slots */
/* a slot is free while its screen->pm.mp_counter entry is NULL */
193 for (c
= 0; c
< 4; ++c
) {
194 if (!screen
->pm
.mp_counter
[c
]) {
196 screen
->pm
.mp_counter
[c
] = hsq
;
201 /* select func to aggregate counters */
202 func
= nv50_hw_sm_get_func(c
);
204 /* configure and reset the counter(s) */
/* control word: signal shifted to bit 24, func shifted to bit 8, unit and
 * mode OR-ed in unshifted */
205 BEGIN_NV04(push
, NV50_CP(MP_PM_CONTROL(c
)), 1);
206 PUSH_DATA (push
, (cfg
->ctr
[i
].sig
<< 24) | (func
<< 8)
207 | cfg
->ctr
[i
].unit
| cfg
->ctr
[i
].mode
);
208 BEGIN_NV04(push
, NV50_CP(MP_PM_SET(c
)), 1);
/* End callback for SM counter queries (installed as .end_query in
 * hw_sm_query_funcs). Touches MP_PM_CONTROL for every occupied slot, releases
 * the slots owned by this query, launches the counter read-back compute
 * program on a MPsInTP x TPs grid of 32-thread blocks, and finally
 * re-programs the counters that still belong to other queries. */
215 nv50_hw_sm_end_query(struct nv50_context
*nv50
, struct nv50_hw_query
*hq
)
217 struct nv50_screen
*screen
= nv50
->screen
;
218 struct pipe_context
*pipe
= &nv50
->base
.pipe
;
219 struct nouveau_pushbuf
*push
= nv50
->base
.pushbuf
;
220 struct nv50_hw_sm_query
*hsq
= nv50_hw_sm_query(hq
);
/* remember the compute program bound on entry; it is re-bound after the
 * read-back launch */
221 struct nv50_program
*old
= nv50
->compprog
;
222 struct pipe_grid_info info
= {};
225 const uint block
[3] = { 32, 1, 1 };
226 const uint grid
[3] = { screen
->MPsInTP
, screen
->TPs
, 1 };
/* create the read-back program the first time it is needed and cache it in
 * screen->pm.prog.
 * NOTE(review): the CALLOC_STRUCT() result is dereferenced on the very next
 * line without a NULL check - an allocation failure would crash here. */
229 if (unlikely(!screen
->pm
.prog
)) {
230 struct nv50_program
*prog
= CALLOC_STRUCT(nv50_program
);
231 prog
->type
= PIPE_SHADER_COMPUTE
;
232 prog
->translated
= true;
/* NOTE(review): the cast drops const from a static array, so whoever tears
 * down screen->pm.prog must not free or modify prog->code - confirm. */
235 prog
->code
= (uint32_t *)nv50_read_hw_sm_counters_code
;
236 prog
->code_size
= sizeof(nv50_read_hw_sm_counters_code
);
237 screen
->pm
.prog
= prog
;
240 /* disable all counting */
242 for (c
= 0; c
< 4; c
++) {
243 if (screen
->pm
.mp_counter
[c
]) {
244 BEGIN_NV04(push
, NV50_CP(MP_PM_CONTROL(c
)), 1);
249 /* release counters for this query */
250 for (c
= 0; c
< 4; c
++) {
251 if (screen
->pm
.mp_counter
[c
] == hsq
) {
252 screen
->pm
.num_hw_sm_active
--;
253 screen
->pm
.mp_counter
[c
] = NULL
;
257 BCTX_REFN_bo(nv50
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
261 BEGIN_NV04(push
, SUBC_CP(NV50_GRAPH_SERIALIZE
), 1);
/* bind the read-back program; its two inputs are the offset of the query's
 * storage inside the buffer object and the current sequence number */
264 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
265 input
[0] = hq
->bo
->offset
+ hq
->base_offset
;
266 input
[1] = hq
->sequence
;
268 for (i
= 0; i
< 3; i
++) {
269 info
.block
[i
] = block
[i
];
270 info
.grid
[i
] = grid
[i
];
/* run the read-back program, then restore the previously bound one */
274 pipe
->launch_grid(pipe
, &info
);
275 pipe
->bind_compute_state(pipe
, old
);
277 nouveau_bufctx_reset(nv50
->bufctx_cp
, NV50_BIND_CP_QUERY
);
279 /* re-activate other counters */
282 for (c
= 0; c
< 4; c
++) {
283 const struct nv50_hw_sm_query_cfg
*cfg
;
286 hsq
= screen
->pm
.mp_counter
[c
];
290 cfg
= nv50_hw_sm_query_get_cfg(nv50
, &hsq
->base
);
291 for (i
= 0; i
< cfg
->num_counters
; i
++) {
/* mask holds one bit per counter slot; presumably it keeps a slot from
 * being programmed twice in this pass - confirm */
294 if (mask
& (1 << hsq
->ctr
[i
]))
297 mask
|= 1 << hsq
->ctr
[i
];
298 func
= nv50_hw_sm_get_func(hsq
->ctr
[i
]);
300 BEGIN_NV04(push
, NV50_CP(MP_PM_CONTROL(hsq
->ctr
[i
])), 1);
301 PUSH_DATA (push
, (cfg
->ctr
[i
].sig
<< 24) | (func
<< 8)
302 | cfg
->ctr
[i
].unit
| cfg
->ctr
[i
].mode
);
/* Copies the counter values of a query out of its result buffer into
 * count[p][c], where p is the MP index (below mp_count) and c the counter
 * index (below cfg->num_counters).
 * Each MP owns 0x14 / 4 = 5 consecutive 32-bit words in hq->data. Word 4 is
 * compared against hq->sequence before a value is taken; when they differ,
 * nouveau_bo_wait() is called on the buffer. How `wait` and a failed
 * nouveau_bo_wait() are handled is not visible in this excerpt. */
308 nv50_hw_sm_query_read_data(uint32_t count
[32][4],
309 struct nv50_context
*nv50
, bool wait
,
310 struct nv50_hw_query
*hq
,
311 const struct nv50_hw_sm_query_cfg
*cfg
,
314 struct nv50_hw_sm_query
*hsq
= nv50_hw_sm_query(hq
);
317 for (p
= 0; p
< mp_count
; ++p
) {
/* b: index of the first word belonging to MP p */
318 const unsigned b
= (0x14 / 4) * p
;
320 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
321 if (hq
->data
[b
+ 4] != hq
->sequence
) {
324 if (nouveau_bo_wait(hq
->bo
, NOUVEAU_BO_RD
, nv50
->base
.client
))
/* hsq->ctr[c] selects which word of this MP's group holds counter c */
327 count
[p
][c
] = hq
->data
[b
+ hsq
->ctr
[c
]];
/* Result callback for SM counter queries (installed as .get_query_result in
 * hw_sm_query_funcs). Reads the per-MP counter values of at most 32 MPs via
 * nv50_hw_sm_query_read_data(), adds all of them up, scales the sum by the
 * number of TPs and stores it as a 64-bit value at the start of *result. */
334 nv50_hw_sm_get_query_result(struct nv50_context
*nv50
, struct nv50_hw_query
*hq
,
335 boolean wait
, union pipe_query_result
*result
)
337 uint32_t count
[32][4];
/* clamp to the first dimension of count[][] */
339 unsigned mp_count
= MIN2(nv50
->screen
->MPsInTP
, 32);
341 const struct nv50_hw_sm_query_cfg
*cfg
;
344 cfg
= nv50_hw_sm_query_get_cfg(nv50
, hq
);
346 ret
= nv50_hw_sm_query_read_data(count
, nv50
, wait
, hq
, cfg
, mp_count
);
350 for (c
= 0; c
< cfg
->num_counters
; ++c
)
351 for (p
= 0; p
< mp_count
; ++p
)
352 value
+= count
[p
][c
];
354 /* We only count a single TP, and simply multiply by the total number of
355 * TPs to compute result over all TPs. This is inaccurate, but enough! */
356 value
*= nv50
->screen
->TPs
;
358 *(uint64_t *)result
= value
;
362 static const struct nv50_hw_query_funcs hw_sm_query_funcs
= {
363 .destroy_query
= nv50_hw_sm_destroy_query
,
364 .begin_query
= nv50_hw_sm_begin_query
,
365 .end_query
= nv50_hw_sm_end_query
,
366 .get_query_result
= nv50_hw_sm_get_query_result
,
/* Creates an SM counter query of the given type.
 * Types outside NV50_HW_SM_QUERY(0) .. NV50_HW_SM_QUERY_LAST are filtered out
 * first (the rejection path itself is not visible in this excerpt). The new
 * query gets hw_sm_query_funcs as its callback table and result storage of
 * (4 + 1) 32-bit words per MP. */
369 struct nv50_hw_query
*
370 nv50_hw_sm_create_query(struct nv50_context
*nv50
, unsigned type
)
372 struct nv50_hw_sm_query
*hsq
;
373 struct nv50_hw_query
*hq
;
376 if (type
< NV50_HW_SM_QUERY(0) || type
> NV50_HW_SM_QUERY_LAST
)
379 hsq
= CALLOC_STRUCT(nv50_hw_sm_query
);
384 hq
->funcs
= &hw_sm_query_funcs
;
385 hq
->base
.type
= type
;
/* five words per MP; presumably four counter values followed by one sequence
 * word, matching the b + 4 sequence check in nv50_hw_sm_query_read_data() -
 * confirm */
395 space
= (4 + 1) * nv50
->screen
->MPsInTP
* sizeof(uint32_t);
397 if (!nv50_hw_query_allocate(nv50
, &hq
->base
, space
)) {
406 nv50_hw_sm_get_driver_query_info(struct nv50_screen
*screen
, unsigned id
,
407 struct pipe_driver_query_info
*info
)
412 if (screen
->base
.class_3d
>= NV84_3D_CLASS
)
413 count
+= NV50_HW_SM_QUERY_COUNT
;
419 if (screen
->compute
) {
420 if (screen
->base
.class_3d
>= NV84_3D_CLASS
) {
421 info
->name
= nv50_hw_sm_query_names
[id
];
422 info
->query_type
= NV50_HW_SM_QUERY(id
);
423 info
->group_id
= NV50_HW_SM_QUERY_GROUP
;