2 * Copyright 2011 Nouveau Project
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
22 * Authors: Christoph Bumiller
25 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
27 #include "nvc0/nvc0_context.h"
28 #include "nv_object.xml.h"
29 #include "nvc0/nve4_compute.xml.h"
30 #include "nvc0/nvc0_compute.xml.h"
32 #define NVC0_QUERY_STATE_READY 0
33 #define NVC0_QUERY_STATE_ACTIVE 1
34 #define NVC0_QUERY_STATE_ENDED 2
35 #define NVC0_QUERY_STATE_FLUSHED 3
43 struct nouveau_bo
*bo
;
45 uint32_t offset
; /* base + i * rotate */
49 int nesting
; /* only used for occlusion queries */
51 struct nouveau_mm_allocation
*mm
;
54 struct nouveau_fence
*fence
;
57 #define NVC0_QUERY_ALLOC_SPACE 256
59 static boolean
nvc0_mp_pm_query_begin(struct nvc0_context
*,
61 static void nvc0_mp_pm_query_end(struct nvc0_context
*, struct nvc0_query
*);
62 static boolean
nvc0_mp_pm_query_result(struct nvc0_context
*,
63 struct nvc0_query
*, void *, boolean
);
65 static INLINE
struct nvc0_query
*
66 nvc0_query(struct pipe_query
*pipe
)
68 return (struct nvc0_query
*)pipe
;
72 nvc0_query_allocate(struct nvc0_context
*nvc0
, struct nvc0_query
*q
, int size
)
74 struct nvc0_screen
*screen
= nvc0
->screen
;
78 nouveau_bo_ref(NULL
, &q
->bo
);
80 if (q
->state
== NVC0_QUERY_STATE_READY
)
81 nouveau_mm_free(q
->u
.mm
);
83 nouveau_fence_work(screen
->base
.fence
.current
,
84 nouveau_mm_free_work
, q
->u
.mm
);
88 q
->u
.mm
= nouveau_mm_allocate(screen
->base
.mm_GART
, size
, &q
->bo
, &q
->base
);
93 ret
= nouveau_bo_map(q
->bo
, 0, screen
->base
.client
);
95 nvc0_query_allocate(nvc0
, q
, 0);
98 q
->data
= (uint32_t *)((uint8_t *)q
->bo
->map
+ q
->base
);
104 nvc0_query_destroy(struct pipe_context
*pipe
, struct pipe_query
*pq
)
106 nvc0_query_allocate(nvc0_context(pipe
), nvc0_query(pq
), 0);
107 nouveau_fence_ref(NULL
, &nvc0_query(pq
)->fence
);
108 FREE(nvc0_query(pq
));
111 static struct pipe_query
*
112 nvc0_query_create(struct pipe_context
*pipe
, unsigned type
, unsigned index
)
114 struct nvc0_context
*nvc0
= nvc0_context(pipe
);
115 struct nvc0_query
*q
;
116 unsigned space
= NVC0_QUERY_ALLOC_SPACE
;
118 q
= CALLOC_STRUCT(nvc0_query
);
123 case PIPE_QUERY_OCCLUSION_COUNTER
:
124 case PIPE_QUERY_OCCLUSION_PREDICATE
:
126 space
= NVC0_QUERY_ALLOC_SPACE
;
128 case PIPE_QUERY_PIPELINE_STATISTICS
:
132 case PIPE_QUERY_SO_STATISTICS
:
133 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
137 case PIPE_QUERY_PRIMITIVES_GENERATED
:
138 case PIPE_QUERY_PRIMITIVES_EMITTED
:
143 case PIPE_QUERY_TIME_ELAPSED
:
144 case PIPE_QUERY_TIMESTAMP
:
145 case PIPE_QUERY_TIMESTAMP_DISJOINT
:
146 case PIPE_QUERY_GPU_FINISHED
:
149 case NVC0_QUERY_TFB_BUFFER_OFFSET
:
153 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
154 if (type
>= NVC0_QUERY_DRV_STAT(0) && type
<= NVC0_QUERY_DRV_STAT_LAST
) {
157 q
->index
= type
- NVC0_QUERY_DRV_STAT(0);
161 if (nvc0
->screen
->base
.device
->drm_version
>= 0x01000101) {
162 if (type
>= NVE4_PM_QUERY(0) && type
<= NVE4_PM_QUERY_LAST
) {
184 * [50] = WS0.sequence
185 * [54] = WS1.sequence
186 * [58] = WS2.sequence
187 * [5c] = WS3.sequence
189 space
= (4 * 4 + 4 + 4) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
192 if (type
>= NVC0_PM_QUERY(0) && type
<= NVC0_PM_QUERY_LAST
) {
204 space
= (8 + 1) * nvc0
->screen
->mp_count
* sizeof(uint32_t);
208 debug_printf("invalid query type: %u\n", type
);
212 if (!nvc0_query_allocate(nvc0
, q
, space
)) {
220 /* we advance before query_begin ! */
221 q
->offset
-= q
->rotate
;
222 q
->data
-= q
->rotate
/ sizeof(*q
->data
);
225 q
->data
[0] = 0; /* initialize sequence */
227 return (struct pipe_query
*)q
;
231 nvc0_query_get(struct nouveau_pushbuf
*push
, struct nvc0_query
*q
,
232 unsigned offset
, uint32_t get
)
237 PUSH_REFN (push
, q
->bo
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
);
238 BEGIN_NVC0(push
, NVC0_3D(QUERY_ADDRESS_HIGH
), 4);
239 PUSH_DATAh(push
, q
->bo
->offset
+ offset
);
240 PUSH_DATA (push
, q
->bo
->offset
+ offset
);
241 PUSH_DATA (push
, q
->sequence
);
242 PUSH_DATA (push
, get
);
246 nvc0_query_rotate(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
248 q
->offset
+= q
->rotate
;
249 q
->data
+= q
->rotate
/ sizeof(*q
->data
);
250 if (q
->offset
- q
->base
== NVC0_QUERY_ALLOC_SPACE
)
251 nvc0_query_allocate(nvc0
, q
, NVC0_QUERY_ALLOC_SPACE
);
255 nvc0_query_begin(struct pipe_context
*pipe
, struct pipe_query
*pq
)
257 struct nvc0_context
*nvc0
= nvc0_context(pipe
);
258 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
259 struct nvc0_query
*q
= nvc0_query(pq
);
262 /* For occlusion queries we have to change the storage, because a previous
263 * query might set the initial render conition to FALSE even *after* we re-
264 * initialized it to TRUE.
267 nvc0_query_rotate(nvc0
, q
);
269 /* XXX: can we do this with the GPU, and sync with respect to a previous
272 q
->data
[0] = q
->sequence
; /* initialize sequence */
273 q
->data
[1] = 1; /* initial render condition = TRUE */
274 q
->data
[4] = q
->sequence
+ 1; /* for comparison COND_MODE */
280 case PIPE_QUERY_OCCLUSION_COUNTER
:
281 case PIPE_QUERY_OCCLUSION_PREDICATE
:
282 q
->nesting
= nvc0
->screen
->num_occlusion_queries_active
++;
284 nvc0_query_get(push
, q
, 0x10, 0x0100f002);
287 BEGIN_NVC0(push
, NVC0_3D(COUNTER_RESET
), 1);
288 PUSH_DATA (push
, NVC0_3D_COUNTER_RESET_SAMPLECNT
);
289 IMMED_NVC0(push
, NVC0_3D(SAMPLECNT_ENABLE
), 1);
292 case PIPE_QUERY_PRIMITIVES_GENERATED
:
293 nvc0_query_get(push
, q
, 0x10, 0x09005002 | (q
->index
<< 5));
295 case PIPE_QUERY_PRIMITIVES_EMITTED
:
296 nvc0_query_get(push
, q
, 0x10, 0x05805002 | (q
->index
<< 5));
298 case PIPE_QUERY_SO_STATISTICS
:
299 nvc0_query_get(push
, q
, 0x20, 0x05805002 | (q
->index
<< 5));
300 nvc0_query_get(push
, q
, 0x30, 0x06805002 | (q
->index
<< 5));
302 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
303 nvc0_query_get(push
, q
, 0x10, 0x03005002 | (q
->index
<< 5));
305 case PIPE_QUERY_TIME_ELAPSED
:
306 nvc0_query_get(push
, q
, 0x10, 0x00005002);
308 case PIPE_QUERY_PIPELINE_STATISTICS
:
309 nvc0_query_get(push
, q
, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
310 nvc0_query_get(push
, q
, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
311 nvc0_query_get(push
, q
, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
312 nvc0_query_get(push
, q
, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
313 nvc0_query_get(push
, q
, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
314 nvc0_query_get(push
, q
, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
315 nvc0_query_get(push
, q
, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
316 nvc0_query_get(push
, q
, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
317 nvc0_query_get(push
, q
, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
318 nvc0_query_get(push
, q
, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
321 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
322 if (q
->type
>= NVC0_QUERY_DRV_STAT(0) &&
323 q
->type
<= NVC0_QUERY_DRV_STAT_LAST
) {
325 q
->u
.value
= nvc0
->screen
->base
.stats
.v
[q
->index
];
330 if ((q
->type
>= NVE4_PM_QUERY(0) && q
->type
<= NVE4_PM_QUERY_LAST
) ||
331 (q
->type
>= NVC0_PM_QUERY(0) && q
->type
<= NVC0_PM_QUERY_LAST
)) {
332 ret
= nvc0_mp_pm_query_begin(nvc0
, q
);
336 q
->state
= NVC0_QUERY_STATE_ACTIVE
;
341 nvc0_query_end(struct pipe_context
*pipe
, struct pipe_query
*pq
)
343 struct nvc0_context
*nvc0
= nvc0_context(pipe
);
344 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
345 struct nvc0_query
*q
= nvc0_query(pq
);
347 if (q
->state
!= NVC0_QUERY_STATE_ACTIVE
) {
348 /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
350 nvc0_query_rotate(nvc0
, q
);
353 q
->state
= NVC0_QUERY_STATE_ENDED
;
356 case PIPE_QUERY_OCCLUSION_COUNTER
:
357 case PIPE_QUERY_OCCLUSION_PREDICATE
:
358 nvc0_query_get(push
, q
, 0, 0x0100f002);
359 if (--nvc0
->screen
->num_occlusion_queries_active
== 0) {
361 IMMED_NVC0(push
, NVC0_3D(SAMPLECNT_ENABLE
), 0);
364 case PIPE_QUERY_PRIMITIVES_GENERATED
:
365 nvc0_query_get(push
, q
, 0, 0x09005002 | (q
->index
<< 5));
367 case PIPE_QUERY_PRIMITIVES_EMITTED
:
368 nvc0_query_get(push
, q
, 0, 0x05805002 | (q
->index
<< 5));
370 case PIPE_QUERY_SO_STATISTICS
:
371 nvc0_query_get(push
, q
, 0x00, 0x05805002 | (q
->index
<< 5));
372 nvc0_query_get(push
, q
, 0x10, 0x06805002 | (q
->index
<< 5));
374 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
375 /* TODO: How do we sum over all streams for render condition ? */
376 /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
377 nvc0_query_get(push
, q
, 0x00, 0x03005002 | (q
->index
<< 5));
378 nvc0_query_get(push
, q
, 0x20, 0x00005002);
380 case PIPE_QUERY_TIMESTAMP
:
381 case PIPE_QUERY_TIME_ELAPSED
:
382 nvc0_query_get(push
, q
, 0, 0x00005002);
384 case PIPE_QUERY_GPU_FINISHED
:
385 nvc0_query_get(push
, q
, 0, 0x1000f010);
387 case PIPE_QUERY_PIPELINE_STATISTICS
:
388 nvc0_query_get(push
, q
, 0x00, 0x00801002); /* VFETCH, VERTICES */
389 nvc0_query_get(push
, q
, 0x10, 0x01801002); /* VFETCH, PRIMS */
390 nvc0_query_get(push
, q
, 0x20, 0x02802002); /* VP, LAUNCHES */
391 nvc0_query_get(push
, q
, 0x30, 0x03806002); /* GP, LAUNCHES */
392 nvc0_query_get(push
, q
, 0x40, 0x04806002); /* GP, PRIMS_OUT */
393 nvc0_query_get(push
, q
, 0x50, 0x07804002); /* RAST, PRIMS_IN */
394 nvc0_query_get(push
, q
, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
395 nvc0_query_get(push
, q
, 0x70, 0x0980a002); /* ROP, PIXELS */
396 nvc0_query_get(push
, q
, 0x80, 0x0d808002); /* TCP, LAUNCHES */
397 nvc0_query_get(push
, q
, 0x90, 0x0e809002); /* TEP, LAUNCHES */
399 case NVC0_QUERY_TFB_BUFFER_OFFSET
:
400 /* indexed by TFB buffer instead of by vertex stream */
401 nvc0_query_get(push
, q
, 0x00, 0x0d005002 | (q
->index
<< 5));
403 case PIPE_QUERY_TIMESTAMP_DISJOINT
:
404 /* This query is not issued on GPU because disjoint is forced to FALSE */
405 q
->state
= NVC0_QUERY_STATE_READY
;
408 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
409 if (q
->type
>= NVC0_QUERY_DRV_STAT(0) &&
410 q
->type
<= NVC0_QUERY_DRV_STAT_LAST
) {
411 q
->u
.value
= nvc0
->screen
->base
.stats
.v
[q
->index
] - q
->u
.value
;
415 if ((q
->type
>= NVE4_PM_QUERY(0) && q
->type
<= NVE4_PM_QUERY_LAST
) ||
416 (q
->type
>= NVC0_PM_QUERY(0) && q
->type
<= NVC0_PM_QUERY_LAST
)) {
417 nvc0_mp_pm_query_end(nvc0
, q
);
422 nouveau_fence_ref(nvc0
->screen
->base
.fence
.current
, &q
->fence
);
426 nvc0_query_update(struct nouveau_client
*cli
, struct nvc0_query
*q
)
429 if (nouveau_fence_signalled(q
->fence
))
430 q
->state
= NVC0_QUERY_STATE_READY
;
432 if (q
->data
[0] == q
->sequence
)
433 q
->state
= NVC0_QUERY_STATE_READY
;
438 nvc0_query_result(struct pipe_context
*pipe
, struct pipe_query
*pq
,
439 boolean wait
, union pipe_query_result
*result
)
441 struct nvc0_context
*nvc0
= nvc0_context(pipe
);
442 struct nvc0_query
*q
= nvc0_query(pq
);
443 uint64_t *res64
= (uint64_t*)result
;
444 uint32_t *res32
= (uint32_t*)result
;
445 boolean
*res8
= (boolean
*)result
;
446 uint64_t *data64
= (uint64_t *)q
->data
;
449 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
450 if (q
->type
>= NVC0_QUERY_DRV_STAT(0) &&
451 q
->type
<= NVC0_QUERY_DRV_STAT_LAST
) {
452 res64
[0] = q
->u
.value
;
456 if ((q
->type
>= NVE4_PM_QUERY(0) && q
->type
<= NVE4_PM_QUERY_LAST
) ||
457 (q
->type
>= NVC0_PM_QUERY(0) && q
->type
<= NVC0_PM_QUERY_LAST
)) {
458 return nvc0_mp_pm_query_result(nvc0
, q
, result
, wait
);
461 if (q
->state
!= NVC0_QUERY_STATE_READY
)
462 nvc0_query_update(nvc0
->screen
->base
.client
, q
);
464 if (q
->state
!= NVC0_QUERY_STATE_READY
) {
466 if (q
->state
!= NVC0_QUERY_STATE_FLUSHED
) {
467 q
->state
= NVC0_QUERY_STATE_FLUSHED
;
468 /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
469 PUSH_KICK(nvc0
->base
.pushbuf
);
473 if (nouveau_bo_wait(q
->bo
, NOUVEAU_BO_RD
, nvc0
->screen
->base
.client
))
475 NOUVEAU_DRV_STAT(&nvc0
->screen
->base
, query_sync_count
, 1);
477 q
->state
= NVC0_QUERY_STATE_READY
;
480 case PIPE_QUERY_GPU_FINISHED
:
483 case PIPE_QUERY_OCCLUSION_COUNTER
: /* u32 sequence, u32 count, u64 time */
484 res64
[0] = q
->data
[1] - q
->data
[5];
486 case PIPE_QUERY_OCCLUSION_PREDICATE
:
487 res8
[0] = q
->data
[1] != q
->data
[5];
489 case PIPE_QUERY_PRIMITIVES_GENERATED
: /* u64 count, u64 time */
490 case PIPE_QUERY_PRIMITIVES_EMITTED
: /* u64 count, u64 time */
491 res64
[0] = data64
[0] - data64
[2];
493 case PIPE_QUERY_SO_STATISTICS
:
494 res64
[0] = data64
[0] - data64
[4];
495 res64
[1] = data64
[2] - data64
[6];
497 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
498 res8
[0] = data64
[0] != data64
[2];
500 case PIPE_QUERY_TIMESTAMP
:
501 res64
[0] = data64
[1];
503 case PIPE_QUERY_TIMESTAMP_DISJOINT
:
504 res64
[0] = 1000000000;
507 case PIPE_QUERY_TIME_ELAPSED
:
508 res64
[0] = data64
[1] - data64
[3];
510 case PIPE_QUERY_PIPELINE_STATISTICS
:
511 for (i
= 0; i
< 10; ++i
)
512 res64
[i
] = data64
[i
* 2] - data64
[24 + i
* 2];
514 case NVC0_QUERY_TFB_BUFFER_OFFSET
:
515 res32
[0] = q
->data
[1];
518 assert(0); /* can't happen, we don't create queries with invalid type */
526 nvc0_query_fifo_wait(struct nouveau_pushbuf
*push
, struct pipe_query
*pq
)
528 struct nvc0_query
*q
= nvc0_query(pq
);
529 unsigned offset
= q
->offset
;
531 if (q
->type
== PIPE_QUERY_SO_OVERFLOW_PREDICATE
) offset
+= 0x20;
534 PUSH_REFN (push
, q
->bo
, NOUVEAU_BO_GART
| NOUVEAU_BO_RD
);
535 BEGIN_NVC0(push
, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH
), 4);
536 PUSH_DATAh(push
, q
->bo
->offset
+ offset
);
537 PUSH_DATA (push
, q
->bo
->offset
+ offset
);
538 PUSH_DATA (push
, q
->sequence
);
539 PUSH_DATA (push
, (1 << 12) |
540 NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL
);
544 nvc0_render_condition(struct pipe_context
*pipe
,
545 struct pipe_query
*pq
,
546 boolean condition
, uint mode
)
548 struct nvc0_context
*nvc0
= nvc0_context(pipe
);
549 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
550 struct nvc0_query
*q
;
553 mode
!= PIPE_RENDER_COND_NO_WAIT
&&
554 mode
!= PIPE_RENDER_COND_BY_REGION_NO_WAIT
;
557 cond
= NVC0_3D_COND_MODE_ALWAYS
;
561 /* NOTE: comparison of 2 queries only works if both have completed */
563 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
564 cond
= condition
? NVC0_3D_COND_MODE_EQUAL
:
565 NVC0_3D_COND_MODE_NOT_EQUAL
;
568 case PIPE_QUERY_OCCLUSION_COUNTER
:
569 case PIPE_QUERY_OCCLUSION_PREDICATE
:
570 if (likely(!condition
)) {
571 if (unlikely(q
->nesting
))
572 cond
= wait
? NVC0_3D_COND_MODE_NOT_EQUAL
:
573 NVC0_3D_COND_MODE_ALWAYS
;
575 cond
= NVC0_3D_COND_MODE_RES_NON_ZERO
;
577 cond
= wait
? NVC0_3D_COND_MODE_EQUAL
: NVC0_3D_COND_MODE_ALWAYS
;
581 assert(!"render condition query not a predicate");
582 cond
= NVC0_3D_COND_MODE_ALWAYS
;
587 nvc0
->cond_query
= pq
;
588 nvc0
->cond_cond
= condition
;
589 nvc0
->cond_condmode
= cond
;
590 nvc0
->cond_mode
= mode
;
594 IMMED_NVC0(push
, NVC0_3D(COND_MODE
), cond
);
599 nvc0_query_fifo_wait(push
, pq
);
602 PUSH_REFN (push
, q
->bo
, NOUVEAU_BO_GART
| NOUVEAU_BO_RD
);
603 BEGIN_NVC0(push
, NVC0_3D(COND_ADDRESS_HIGH
), 3);
604 PUSH_DATAh(push
, q
->bo
->offset
+ q
->offset
);
605 PUSH_DATA (push
, q
->bo
->offset
+ q
->offset
);
606 PUSH_DATA (push
, cond
);
607 BEGIN_NVC0(push
, NVC0_2D(COND_ADDRESS_HIGH
), 2);
608 PUSH_DATAh(push
, q
->bo
->offset
+ q
->offset
);
609 PUSH_DATA (push
, q
->bo
->offset
+ q
->offset
);
613 nvc0_query_pushbuf_submit(struct nouveau_pushbuf
*push
,
614 struct pipe_query
*pq
, unsigned result_offset
)
616 struct nvc0_query
*q
= nvc0_query(pq
);
618 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
620 nouveau_pushbuf_space(push
, 0, 0, 1);
621 nouveau_pushbuf_data(push
, q
->bo
, q
->offset
+ result_offset
, 4 |
622 NVC0_IB_ENTRY_1_NO_PREFETCH
);
626 nvc0_so_target_save_offset(struct pipe_context
*pipe
,
627 struct pipe_stream_output_target
*ptarg
,
628 unsigned index
, boolean
*serialize
)
630 struct nvc0_so_target
*targ
= nvc0_so_target(ptarg
);
634 PUSH_SPACE(nvc0_context(pipe
)->base
.pushbuf
, 1);
635 IMMED_NVC0(nvc0_context(pipe
)->base
.pushbuf
, NVC0_3D(SERIALIZE
), 0);
637 NOUVEAU_DRV_STAT(nouveau_screen(pipe
->screen
), gpu_serialize_count
, 1);
640 nvc0_query(targ
->pq
)->index
= index
;
642 nvc0_query_end(pipe
, targ
->pq
);
646 /* === DRIVER STATISTICS === */
648 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
650 static const char *nvc0_drv_stat_names
[] =
652 "drv-tex_obj_current_count",
653 "drv-tex_obj_current_bytes",
654 "drv-buf_obj_current_count",
655 "drv-buf_obj_current_bytes_vid",
656 "drv-buf_obj_current_bytes_sys",
657 "drv-tex_transfers_rd",
658 "drv-tex_transfers_wr",
659 "drv-tex_copy_count",
660 "drv-tex_blit_count",
661 "drv-tex_cache_flush_count",
662 "drv-buf_transfers_rd",
663 "drv-buf_transfers_wr",
664 "drv-buf_read_bytes_staging_vid",
665 "drv-buf_write_bytes_direct",
666 "drv-buf_write_bytes_staging_vid",
667 "drv-buf_write_bytes_staging_sys",
668 "drv-buf_copy_bytes",
669 "drv-buf_non_kernel_fence_sync_count",
670 "drv-any_non_kernel_fence_sync_count",
671 "drv-query_sync_count",
672 "drv-gpu_serialize_count",
673 "drv-draw_calls_array",
674 "drv-draw_calls_indexed",
675 "drv-draw_calls_fallback_count",
676 "drv-user_buffer_upload_bytes",
677 "drv-constbuf_upload_count",
678 "drv-constbuf_upload_bytes",
680 "drv-resource_validate_count"
683 #endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */
686 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
688 /* Code to read out MP counters: They are accessible via mmio, too, but let's
689 * just avoid mapping registers in userspace. We'd have to know which MPs are
690 * enabled/present, too, and that information is not presently exposed.
691 * We could add a kernel interface for it, but reading the counters like this
692 * has the advantage of being async (if get_result isn't called immediately).
694 static const uint64_t nve4_read_mp_pm_counters_code
[] =
696 /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
698 * mov b32 $r12 $physid
704 * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
708 * set $p0 0x1 eq u32 $r8 0x0
709 * mov b32 $r10 c0[0x0]
710 * ext u32 $r8 $r12 0x414
711 * mov b32 $r11 c0[0x4]
712 * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
713 * ext u32 $r9 $r12 0x208
715 * set $p1 0x1 eq u32 $r9 0x0
716 * mul $r8 u32 $r8 u32 96
717 * mul $r12 u32 $r9 u32 16
718 * mul $r13 u32 $r9 u32 4
719 * add b32 $r9 $r8 $r13
720 * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
721 * add b32 $r8 $r8 $r12
723 * add b32 $r10 $c $r10 $r8
725 * add b32 $r11 $r11 0x0 $c
726 * add b32 $r12 $c $r12 $r9
727 * st b128 wt g[$r10d] $r0q
728 * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
729 * mov b32 $r0 c0[0x8]
730 * add b32 $r13 $r13 0x0 $c
731 * $p1 st b128 wt g[$r12d+0x40] $r4q
732 * st b32 wt g[$r12d+0x50] $r0
734 0x2202020202020207ULL
,
735 0x2c00000084021c04ULL
,
736 0x2c0000000c031c04ULL
,
737 0x2c00000010001c04ULL
,
738 0x2c00000014005c04ULL
,
739 0x2c00000018009c04ULL
,
740 0x2c0000001c00dc04ULL
,
741 0x2c00000020011c04ULL
,
742 0x22b0420042320207ULL
,
743 0x2c00000024015c04ULL
,
744 0x2c00000028019c04ULL
,
745 0x2c0000002c01dc04ULL
,
746 0x190e0000fc81dc03ULL
,
747 0x2800400000029de4ULL
,
748 0x7000c01050c21c03ULL
,
749 0x280040001002dde4ULL
,
750 0x204282020042e047ULL
,
751 0x7000c00820c25c03ULL
,
752 0x80000000000021e7ULL
,
753 0x190e0000fc93dc03ULL
,
754 0x1000000180821c02ULL
,
755 0x1000000040931c02ULL
,
756 0x1000000010935c02ULL
,
757 0x4800000034825c03ULL
,
758 0x22c042c042c04287ULL
,
759 0x4800000030821c03ULL
,
760 0x2800000028031de4ULL
,
761 0x4801000020a29c03ULL
,
762 0x280000002c035de4ULL
,
763 0x0800000000b2dc42ULL
,
764 0x4801000024c31c03ULL
,
765 0x9400000000a01fc5ULL
,
766 0x200002e04202c047ULL
,
767 0x2800400020001de4ULL
,
768 0x0800000000d35c42ULL
,
769 0x9400000100c107c5ULL
,
770 0x9400000140c01f85ULL
,
771 0x8000000000001de7ULL
774 /* NOTE: intentionally using the same names as NV */
775 static const char *nve4_pm_query_names
[] =
797 "l1_local_load_miss",
798 "l1_local_store_hit",
799 "l1_local_store_miss",
802 "l1_global_load_hit",
803 "l1_global_load_miss",
804 "uncached_global_load_transaction",
805 "global_store_transaction",
813 "shared_load_replay",
814 "shared_store_replay",
815 "local_load_transactions",
816 "local_store_transactions",
817 "l1_shared_load_transactions",
818 "l1_shared_store_transactions",
819 "global_ld_mem_divergence_replays",
820 "global_st_mem_divergence_replays",
821 /* metrics, i.e. functions of the MP counters */
822 "metric-ipc", /* inst_executed, clock */
823 "metric-ipac", /* inst_executed, active_cycles */
824 "metric-ipec", /* inst_executed, (bool)inst_executed */
825 "metric-achieved_occupancy", /* active_warps, active_cycles */
826 "metric-sm_efficiency", /* active_cycles, clock */
827 "metric-inst_replay_overhead" /* inst_issued, inst_executed */
830 /* For simplicity, we will allocate as many group slots as we allocate counter
831 * slots. This means that a single counter which wants to source from 2 groups
832 * will have to be declared as using 2 counter slots. This shouldn't really be
833 * a problem because such queries don't make much sense ... (unless someone is
/* Configuration of a single MP performance counter slot. */
struct nvc0_mp_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t num_src : 3;  /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint64_t src_sel;      /* signal selection for up to 6 sources (48 bit) */
};
846 #define NVC0_COUNTER_OPn_SUM 0
847 #define NVC0_COUNTER_OPn_OR 1
848 #define NVC0_COUNTER_OPn_AND 2
849 #define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
850 #define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */
851 #define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
852 #define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */
854 struct nvc0_mp_pm_query_cfg
856 struct nvc0_mp_counter_cfg ctr
[4];
857 uint8_t num_counters
;
859 uint8_t norm
[2]; /* normalization num,denom */
862 #define _Q1A(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
863 #define _Q1B(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
864 #define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
865 { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
866 { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
867 {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
868 #define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
869 { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
870 { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
871 {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
872 #define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
873 { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
874 { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
875 {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
878 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
879 * inst_executed etc.: we only count a single warp scheduler
880 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
881 * this is inaccurate !
883 static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries
[] =
885 _Q1A(PROF_TRIGGER_0
, 0x0001, B6
, USER
, 0x00000000, 1, 1),
886 _Q1A(PROF_TRIGGER_1
, 0x0001, B6
, USER
, 0x00000004, 1, 1),
887 _Q1A(PROF_TRIGGER_2
, 0x0001, B6
, USER
, 0x00000008, 1, 1),
888 _Q1A(PROF_TRIGGER_3
, 0x0001, B6
, USER
, 0x0000000c, 1, 1),
889 _Q1A(PROF_TRIGGER_4
, 0x0001, B6
, USER
, 0x00000010, 1, 1),
890 _Q1A(PROF_TRIGGER_5
, 0x0001, B6
, USER
, 0x00000014, 1, 1),
891 _Q1A(PROF_TRIGGER_6
, 0x0001, B6
, USER
, 0x00000018, 1, 1),
892 _Q1A(PROF_TRIGGER_7
, 0x0001, B6
, USER
, 0x0000001c, 1, 1),
893 _Q1A(LAUNCHED_WARPS
, 0x0001, B6
, LAUNCH
, 0x00000004, 1, 1),
894 _Q1A(LAUNCHED_THREADS
, 0x003f, B6
, LAUNCH
, 0x398a4188, 1, 1),
895 _Q1B(LAUNCHED_CTA
, 0x0001, B6
, WARP
, 0x0000001c, 1, 1),
896 _Q1A(INST_ISSUED1
, 0x0001, B6
, ISSUE
, 0x00000004, 1, 1),
897 _Q1A(INST_ISSUED2
, 0x0001, B6
, ISSUE
, 0x00000008, 1, 1),
898 _Q1A(INST_ISSUED
, 0x0003, B6
, ISSUE
, 0x00000104, 1, 1),
899 _Q1A(INST_EXECUTED
, 0x0003, B6
, EXEC
, 0x00000398, 1, 1),
900 _Q1A(LD_SHARED
, 0x0001, B6
, LDST
, 0x00000000, 1, 1),
901 _Q1A(ST_SHARED
, 0x0001, B6
, LDST
, 0x00000004, 1, 1),
902 _Q1A(LD_LOCAL
, 0x0001, B6
, LDST
, 0x00000008, 1, 1),
903 _Q1A(ST_LOCAL
, 0x0001, B6
, LDST
, 0x0000000c, 1, 1),
904 _Q1A(GLD_REQUEST
, 0x0001, B6
, LDST
, 0x00000010, 1, 1),
905 _Q1A(GST_REQUEST
, 0x0001, B6
, LDST
, 0x00000014, 1, 1),
906 _Q1B(L1_LOCAL_LOAD_HIT
, 0x0001, B6
, L1
, 0x00000000, 1, 1),
907 _Q1B(L1_LOCAL_LOAD_MISS
, 0x0001, B6
, L1
, 0x00000004, 1, 1),
908 _Q1B(L1_LOCAL_STORE_HIT
, 0x0001, B6
, L1
, 0x00000008, 1, 1),
909 _Q1B(L1_LOCAL_STORE_MISS
, 0x0001, B6
, L1
, 0x0000000c, 1, 1),
910 _Q1B(L1_GLOBAL_LOAD_HIT
, 0x0001, B6
, L1
, 0x00000010, 1, 1),
911 _Q1B(L1_GLOBAL_LOAD_MISS
, 0x0001, B6
, L1
, 0x00000014, 1, 1),
912 _Q1B(GLD_TRANSACTIONS_UNCACHED
, 0x0001, B6
, MEM
, 0x00000000, 1, 1),
913 _Q1B(GST_TRANSACTIONS
, 0x0001, B6
, MEM
, 0x00000004, 1, 1),
914 _Q1A(BRANCH
, 0x0001, B6
, BRANCH
, 0x0000000c, 1, 1),
915 _Q1A(BRANCH_DIVERGENT
, 0x0001, B6
, BRANCH
, 0x00000010, 1, 1),
916 _Q1B(ACTIVE_WARPS
, 0x003f, B6
, WARP
, 0x31483104, 2, 1),
917 _Q1B(ACTIVE_CYCLES
, 0x0001, B6
, WARP
, 0x00000000, 1, 1),
918 _Q1A(ATOM_COUNT
, 0x0001, B6
, BRANCH
, 0x00000000, 1, 1),
919 _Q1A(GRED_COUNT
, 0x0001, B6
, BRANCH
, 0x00000008, 1, 1),
920 _Q1B(LD_SHARED_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000008, 1, 1),
921 _Q1B(ST_SHARED_REPLAY
, 0x0001, B6
, REPLAY
, 0x0000000c, 1, 1),
922 _Q1B(LD_LOCAL_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000000, 1, 1),
923 _Q1B(ST_LOCAL_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000004, 1, 1),
924 _Q1B(L1_LD_SHARED_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x00000008, 1, 1),
925 _Q1B(L1_ST_SHARED_TRANSACTIONS
, 0x0001, B6
, TRANSACTION
, 0x0000000c, 1, 1),
926 _Q1B(GLD_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000010, 1, 1),
927 _Q1B(GST_MEM_DIV_REPLAY
, 0x0001, B6
, REPLAY
, 0x00000014, 1, 1),
928 _M2AB(IPC
, 0x3, B6
, EXEC
, 0x398, 0xffff, LOGOP
, WARP
, 0x0, DIV_SUM_M0
, 10, 1),
929 _M2AB(IPAC
, 0x3, B6
, EXEC
, 0x398, 0x1, B6
, WARP
, 0x0, AVG_DIV_MM
, 10, 1),
930 _M2A(IPEC
, 0x3, B6
, EXEC
, 0x398, 0xe, LOGOP
, EXEC
, 0x398, AVG_DIV_MM
, 10, 1),
931 _M2A(INST_REPLAY_OHEAD
, 0x3, B6
, ISSUE
, 0x104, 0x3, B6
, EXEC
, 0x398, REL_SUM_MM
, 100, 1),
932 _M2B(MP_OCCUPANCY
, 0x3f, B6
, WARP
, 0x31483104, 0x01, B6
, WARP
, 0x0, AVG_DIV_MM
, 200, 64),
933 _M2B(MP_EFFICIENCY
, 0x01, B6
, WARP
, 0x0, 0xffff, LOGOP
, WARP
, 0x0, AVG_DIV_M0
, 100, 1),
941 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
942 static const uint64_t nvc0_read_mp_pm_counters_code
[] =
945 * mov b32 $r9 $physid
954 * set $p0 0x1 eq u32 $r8 0x0
955 * mov b32 $r10 c0[0x0]
956 * mov b32 $r11 c0[0x4]
957 * ext u32 $r8 $r9 0x414
959 * mul $r8 u32 $r8 u32 36
960 * add b32 $r10 $c $r10 $r8
961 * add b32 $r11 $r11 0x0 $c
962 * mov b32 $r8 c0[0x8]
963 * st b128 wt g[$r10d+0x00] $r0q
964 * st b128 wt g[$r10d+0x10] $r4q
965 * st b32 wt g[$r10d+0x20] $r8
967 0x2c00000084021c04ULL
,
968 0x2c0000000c025c04ULL
,
969 0x2c00000010001c04ULL
,
970 0x2c00000014005c04ULL
,
971 0x2c00000018009c04ULL
,
972 0x2c0000001c00dc04ULL
,
973 0x2c00000020011c04ULL
,
974 0x2c00000024015c04ULL
,
975 0x2c00000028019c04ULL
,
976 0x2c0000002c01dc04ULL
,
977 0x190e0000fc81dc03ULL
,
978 0x2800400000029de4ULL
,
979 0x280040001002dde4ULL
,
980 0x7000c01050921c03ULL
,
981 0x80000000000021e7ULL
,
982 0x1000000090821c02ULL
,
983 0x4801000020a29c03ULL
,
984 0x0800000000b2dc42ULL
,
985 0x2800400020021de4ULL
,
986 0x9400000000a01fc5ULL
,
987 0x9400000040a11fc5ULL
,
988 0x9400000080a21f85ULL
,
989 0x8000000000001de7ULL
992 static const char *nvc0_pm_query_names
[] =
1014 "thread_inst_executed_0",
1015 "thread_inst_executed_1",
1016 "thread_inst_executed_2",
1017 "thread_inst_executed_3",
1028 #define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_PM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
1030 static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries
[] =
1032 _Q(INST_EXECUTED
, 0xaaaa, LOGOP
, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
1033 _Q(BRANCH
, 0xaaaa, LOGOP
, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
1034 _Q(BRANCH_DIVERGENT
, 0xaaaa, LOGOP
, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
1035 _Q(ACTIVE_WARPS
, 0xaaaa, LOGOP
, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
1036 _Q(ACTIVE_CYCLES
, 0xaaaa, LOGOP
, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
1037 _Q(LAUNCHED_WARPS
, 0xaaaa, LOGOP
, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
1038 _Q(LAUNCHED_THREADS
, 0xaaaa, LOGOP
, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
1039 _Q(LD_SHARED
, 0xaaaa, LOGOP
, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
1040 _Q(ST_SHARED
, 0xaaaa, LOGOP
, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
1041 _Q(LD_LOCAL
, 0xaaaa, LOGOP
, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
1042 _Q(ST_LOCAL
, 0xaaaa, LOGOP
, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
1043 _Q(GRED_COUNT
, 0xaaaa, LOGOP
, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
1044 _Q(ATOM_COUNT
, 0xaaaa, LOGOP
, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
1045 _Q(GLD_REQUEST
, 0xaaaa, LOGOP
, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
1046 _Q(GST_REQUEST
, 0xaaaa, LOGOP
, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
1047 _Q(INST_ISSUED1_0
, 0xaaaa, LOGOP
, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
1048 _Q(INST_ISSUED1_1
, 0xaaaa, LOGOP
, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
1049 _Q(INST_ISSUED2_0
, 0xaaaa, LOGOP
, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
1050 _Q(INST_ISSUED2_1
, 0xaaaa, LOGOP
, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
1051 _Q(TH_INST_EXECUTED_0
, 0xaaaa, LOGOP
, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
1052 _Q(TH_INST_EXECUTED_1
, 0xaaaa, LOGOP
, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
1053 _Q(TH_INST_EXECUTED_2
, 0xaaaa, LOGOP
, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
1054 _Q(TH_INST_EXECUTED_3
, 0xaaaa, LOGOP
, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
1055 _Q(PROF_TRIGGER_0
, 0xaaaa, LOGOP
, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
1056 _Q(PROF_TRIGGER_1
, 0xaaaa, LOGOP
, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
1057 _Q(PROF_TRIGGER_2
, 0xaaaa, LOGOP
, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
1058 _Q(PROF_TRIGGER_3
, 0xaaaa, LOGOP
, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
1059 _Q(PROF_TRIGGER_4
, 0xaaaa, LOGOP
, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
1060 _Q(PROF_TRIGGER_5
, 0xaaaa, LOGOP
, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
1061 _Q(PROF_TRIGGER_6
, 0xaaaa, LOGOP
, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
1062 _Q(PROF_TRIGGER_7
, 0xaaaa, LOGOP
, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
1067 static const struct nvc0_mp_pm_query_cfg
*
1068 nvc0_mp_pm_query_get_cfg(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
1070 struct nvc0_screen
*screen
= nvc0
->screen
;
1072 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1073 return &nve4_mp_pm_queries
[q
->type
- PIPE_QUERY_DRIVER_SPECIFIC
];
1074 return &nvc0_mp_pm_queries
[q
->type
- NVC0_PM_QUERY(0)];
1078 nvc0_mp_pm_query_begin(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
1080 struct nvc0_screen
*screen
= nvc0
->screen
;
1081 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1082 const boolean is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
1083 const struct nvc0_mp_pm_query_cfg
*cfg
;
1085 unsigned num_ab
[2] = { 0, 0 };
1087 cfg
= nvc0_mp_pm_query_get_cfg(nvc0
, q
);
1089 /* check if we have enough free counter slots */
1090 for (i
= 0; i
< cfg
->num_counters
; ++i
)
1091 num_ab
[cfg
->ctr
[i
].sig_dom
]++;
1093 if (screen
->pm
.num_mp_pm_active
[0] + num_ab
[0] > 4 ||
1094 screen
->pm
.num_mp_pm_active
[1] + num_ab
[1] > 4) {
1095 NOUVEAU_ERR("Not enough free MP counter slots !\n");
1099 assert(cfg
->num_counters
<= 4);
1100 PUSH_SPACE(push
, 4 * 8 * (is_nve4
? 1 : 6) + 6);
1102 if (!screen
->pm
.mp_counters_enabled
) {
1103 screen
->pm
.mp_counters_enabled
= TRUE
;
1104 BEGIN_NVC0(push
, SUBC_SW(0x06ac), 1);
1105 PUSH_DATA (push
, 0x1fcb);
1108 /* set sequence field to 0 (used to check if result is available) */
1109 for (i
= 0; i
< screen
->mp_count
; ++i
)
1110 q
->data
[i
* 10 + 10] = 0;
1112 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1113 const unsigned d
= cfg
->ctr
[i
].sig_dom
;
1115 if (!screen
->pm
.num_mp_pm_active
[d
]) {
1116 uint32_t m
= (1 << 22) | (1 << (7 + (8 * !d
)));
1117 if (screen
->pm
.num_mp_pm_active
[!d
])
1118 m
|= 1 << (7 + (8 * d
));
1119 BEGIN_NVC0(push
, SUBC_SW(0x0600), 1);
1120 PUSH_DATA (push
, m
);
1122 screen
->pm
.num_mp_pm_active
[d
]++;
1124 for (c
= d
* 4; c
< (d
* 4 + 4); ++c
) {
1125 if (!screen
->pm
.mp_counter
[c
]) {
1127 screen
->pm
.mp_counter
[c
] = (struct pipe_query
*)q
;
1131 assert(c
<= (d
* 4 + 3)); /* must succeed, already checked for space */
1133 /* configure and reset the counter(s) */
1136 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_A_SIGSEL(c
& 3)), 1);
1138 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_B_SIGSEL(c
& 3)), 1);
1139 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
1140 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SRCSEL(c
)), 1);
1141 PUSH_DATA (push
, cfg
->ctr
[i
].src_sel
+ 0x2108421 * (c
& 3));
1142 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 1);
1143 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1144 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_SET(c
)), 1);
1145 PUSH_DATA (push
, 0);
1149 for (s
= 0; s
< cfg
->ctr
[i
].num_src
; s
++) {
1150 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SIGSEL(s
)), 1);
1151 PUSH_DATA (push
, cfg
->ctr
[i
].sig_sel
);
1152 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SRCSEL(s
)), 1);
1153 PUSH_DATA (push
, (cfg
->ctr
[i
].src_sel
>> (s
* 8)) & 0xff);
1154 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(s
)), 1);
1155 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1156 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_SET(s
)), 1);
1157 PUSH_DATA (push
, 0);
1165 nvc0_mp_pm_query_end(struct nvc0_context
*nvc0
, struct nvc0_query
*q
)
1167 struct nvc0_screen
*screen
= nvc0
->screen
;
1168 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
1169 struct nouveau_pushbuf
*push
= nvc0
->base
.pushbuf
;
1170 const boolean is_nve4
= screen
->base
.class_3d
>= NVE4_3D_CLASS
;
1173 const uint block
[3] = { 32, is_nve4
? 4 : 1, 1 };
1174 const uint grid
[3] = { screen
->mp_count
, 1, 1 };
1176 const struct nvc0_mp_pm_query_cfg
*cfg
;
1178 cfg
= nvc0_mp_pm_query_get_cfg(nvc0
, q
);
1180 if (unlikely(!screen
->pm
.prog
)) {
1181 struct nvc0_program
*prog
= CALLOC_STRUCT(nvc0_program
);
1182 prog
->type
= PIPE_SHADER_COMPUTE
;
1183 prog
->translated
= TRUE
;
1184 prog
->num_gprs
= 14;
1185 prog
->parm_size
= 12;
1187 prog
->code
= (uint32_t *)nve4_read_mp_pm_counters_code
;
1188 prog
->code_size
= sizeof(nve4_read_mp_pm_counters_code
);
1190 prog
->code
= (uint32_t *)nvc0_read_mp_pm_counters_code
;
1191 prog
->code_size
= sizeof(nvc0_read_mp_pm_counters_code
);
1193 screen
->pm
.prog
= prog
;
1196 /* disable all counting */
1197 PUSH_SPACE(push
, 8);
1198 for (c
= 0; c
< 8; ++c
)
1199 if (screen
->pm
.mp_counter
[c
]) {
1201 IMMED_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(c
)), 0);
1203 IMMED_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(c
)), 0);
1206 /* release counters for this query */
1207 for (c
= 0; c
< 8; ++c
) {
1208 if (nvc0_query(screen
->pm
.mp_counter
[c
]) == q
) {
1209 screen
->pm
.num_mp_pm_active
[c
/ 4]--;
1210 screen
->pm
.mp_counter
[c
] = NULL
;
1214 BCTX_REFN_bo(nvc0
->bufctx_cp
, CP_QUERY
, NOUVEAU_BO_GART
| NOUVEAU_BO_WR
,
1217 PUSH_SPACE(push
, 1);
1218 IMMED_NVC0(push
, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE
), 0);
1220 pipe
->bind_compute_state(pipe
, screen
->pm
.prog
);
1221 input
[0] = (q
->bo
->offset
+ q
->base
);
1222 input
[1] = (q
->bo
->offset
+ q
->base
) >> 32;
1223 input
[2] = q
->sequence
;
1224 pipe
->launch_grid(pipe
, block
, grid
, 0, input
);
1226 nouveau_bufctx_reset(nvc0
->bufctx_cp
, NVC0_BIND_CP_QUERY
);
1228 /* re-activate other counters */
1229 PUSH_SPACE(push
, 16);
1231 for (c
= 0; c
< 8; ++c
) {
1233 q
= nvc0_query(screen
->pm
.mp_counter
[c
]);
1236 cfg
= nvc0_mp_pm_query_get_cfg(nvc0
, q
);
1237 for (i
= 0; i
< cfg
->num_counters
; ++i
) {
1238 if (mask
& (1 << q
->ctr
[i
]))
1240 mask
|= 1 << q
->ctr
[i
];
1242 BEGIN_NVC0(push
, NVE4_COMPUTE(MP_PM_FUNC(q
->ctr
[i
])), 1);
1244 BEGIN_NVC0(push
, NVC0_COMPUTE(MP_PM_OP(q
->ctr
[i
])), 1);
1246 PUSH_DATA (push
, (cfg
->ctr
[i
].func
<< 4) | cfg
->ctr
[i
].mode
);
1251 static INLINE boolean
1252 nvc0_mp_pm_query_read_data(uint32_t count
[32][4],
1253 struct nvc0_context
*nvc0
, boolean wait
,
1254 struct nvc0_query
*q
,
1255 const struct nvc0_mp_pm_query_cfg
*cfg
,
1260 for (p
= 0; p
< mp_count
; ++p
) {
1261 const unsigned b
= (0x24 / 4) * p
;
1263 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1264 if (q
->data
[b
+ 8] != q
->sequence
) {
1267 if (nouveau_bo_wait(q
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1270 count
[p
][c
] = q
->data
[b
+ q
->ctr
[c
]];
1276 static INLINE boolean
1277 nve4_mp_pm_query_read_data(uint32_t count
[32][4],
1278 struct nvc0_context
*nvc0
, boolean wait
,
1279 struct nvc0_query
*q
,
1280 const struct nvc0_mp_pm_query_cfg
*cfg
,
1285 for (p
= 0; p
< mp_count
; ++p
) {
1286 const unsigned b
= (0x60 / 4) * p
;
1288 for (c
= 0; c
< cfg
->num_counters
; ++c
) {
1290 for (d
= 0; d
< ((q
->ctr
[c
] & ~3) ? 1 : 4); ++d
) {
1291 if (q
->data
[b
+ 20 + d
] != q
->sequence
) {
1294 if (nouveau_bo_wait(q
->bo
, NOUVEAU_BO_RD
, nvc0
->base
.client
))
1297 if (q
->ctr
[c
] & ~0x3)
1298 count
[p
][c
] = q
->data
[b
+ 16 + (q
->ctr
[c
] & 3)];
1300 count
[p
][c
] += q
->data
[b
+ d
* 4 + q
->ctr
[c
]];
1307 /* Metric calculations:
1308 * sum(x) ... sum of x over all MPs
1309 * avg(x) ... average of x over all MPs
1311 * IPC : sum(inst_executed) / clock
1312 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
1313 * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
1314 * MP_EFFICIENCY : avg(active_cycles / clock)
1316 * NOTE: Interpretation of IPC requires knowledge of MP count.
1319 nvc0_mp_pm_query_result(struct nvc0_context
*nvc0
, struct nvc0_query
*q
,
1320 void *result
, boolean wait
)
1322 uint32_t count
[32][4];
1324 unsigned mp_count
= MIN2(nvc0
->screen
->mp_count_compute
, 32);
1326 const struct nvc0_mp_pm_query_cfg
*cfg
;
1329 cfg
= nvc0_mp_pm_query_get_cfg(nvc0
, q
);
1331 if (nvc0
->screen
->base
.class_3d
>= NVE4_3D_CLASS
)
1332 ret
= nve4_mp_pm_query_read_data(count
, nvc0
, wait
, q
, cfg
, mp_count
);
1334 ret
= nvc0_mp_pm_query_read_data(count
, nvc0
, wait
, q
, cfg
, mp_count
);
1338 if (cfg
->op
== NVC0_COUNTER_OPn_SUM
) {
1339 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1340 for (p
= 0; p
< mp_count
; ++p
)
1341 value
+= count
[p
][c
];
1342 value
= (value
* cfg
->norm
[0]) / cfg
->norm
[1];
1344 if (cfg
->op
== NVC0_COUNTER_OPn_OR
) {
1346 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1347 for (p
= 0; p
< mp_count
; ++p
)
1349 value
= ((uint64_t)v
* cfg
->norm
[0]) / cfg
->norm
[1];
1351 if (cfg
->op
== NVC0_COUNTER_OPn_AND
) {
1353 for (c
= 0; c
< cfg
->num_counters
; ++c
)
1354 for (p
= 0; p
< mp_count
; ++p
)
1356 value
= ((uint64_t)v
* cfg
->norm
[0]) / cfg
->norm
[1];
1358 if (cfg
->op
== NVC0_COUNTER_OP2_REL_SUM_MM
) {
1359 uint64_t v
[2] = { 0, 0 };
1360 for (p
= 0; p
< mp_count
; ++p
) {
1361 v
[0] += count
[p
][0];
1362 v
[1] += count
[p
][1];
1365 value
= ((v
[0] - v
[1]) * cfg
->norm
[0]) / (v
[0] * cfg
->norm
[1]);
1367 if (cfg
->op
== NVC0_COUNTER_OP2_DIV_SUM_M0
) {
1368 for (p
= 0; p
< mp_count
; ++p
)
1369 value
+= count
[p
][0];
1371 value
= (value
* cfg
->norm
[0]) / (count
[0][1] * cfg
->norm
[1]);
1375 if (cfg
->op
== NVC0_COUNTER_OP2_AVG_DIV_MM
) {
1376 unsigned mp_used
= 0;
1377 for (p
= 0; p
< mp_count
; ++p
, mp_used
+= !!count
[p
][0])
1379 value
+= (count
[p
][0] * cfg
->norm
[0]) / count
[p
][1];
1381 value
/= (uint64_t)mp_used
* cfg
->norm
[1];
1383 if (cfg
->op
== NVC0_COUNTER_OP2_AVG_DIV_M0
) {
1384 unsigned mp_used
= 0;
1385 for (p
= 0; p
< mp_count
; ++p
, mp_used
+= !!count
[p
][0])
1386 value
+= count
[p
][0];
1387 if (count
[0][1] && mp_used
) {
1388 value
*= cfg
->norm
[0];
1389 value
/= (uint64_t)count
[0][1] * mp_used
* cfg
->norm
[1];
1395 *(uint64_t *)result
= value
;
1400 nvc0_screen_get_driver_query_info(struct pipe_screen
*pscreen
,
1402 struct pipe_driver_query_info
*info
)
1404 struct nvc0_screen
*screen
= nvc0_screen(pscreen
);
1407 count
+= NVC0_QUERY_DRV_STAT_COUNT
;
1409 if (screen
->base
.device
->drm_version
>= 0x01000101) {
1410 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1411 count
+= NVE4_PM_QUERY_COUNT
;
1413 if (screen
->compute
) {
1414 count
+= NVC0_PM_QUERY_COUNT
; /* NVC0_COMPUTE is not always enabled */
1421 /* Init default values. */
1422 info
->name
= "this_is_not_the_query_you_are_looking_for";
1423 info
->query_type
= 0xdeadd01d;
1424 info
->max_value
.u64
= 0;
1425 info
->type
= PIPE_DRIVER_QUERY_TYPE_UINT64
;
1426 info
->group_id
= -1;
1428 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
1429 if (id
< NVC0_QUERY_DRV_STAT_COUNT
) {
1430 info
->name
= nvc0_drv_stat_names
[id
];
1431 info
->query_type
= NVC0_QUERY_DRV_STAT(id
);
1432 info
->max_value
.u64
= 0;
1433 if (strstr(info
->name
, "bytes"))
1434 info
->type
= PIPE_DRIVER_QUERY_TYPE_BYTES
;
1435 info
->group_id
= NVC0_QUERY_DRV_STAT_GROUP
;
1440 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1441 info
->name
= nve4_pm_query_names
[id
- NVC0_QUERY_DRV_STAT_COUNT
];
1442 info
->query_type
= NVE4_PM_QUERY(id
- NVC0_QUERY_DRV_STAT_COUNT
);
1443 info
->max_value
.u64
=
1444 (id
< NVE4_PM_QUERY_METRIC_MP_OCCUPANCY
) ? 0 : 100;
1445 info
->group_id
= NVC0_QUERY_MP_COUNTER_GROUP
;
1448 if (screen
->compute
) {
1449 info
->name
= nvc0_pm_query_names
[id
- NVC0_QUERY_DRV_STAT_COUNT
];
1450 info
->query_type
= NVC0_PM_QUERY(id
- NVC0_QUERY_DRV_STAT_COUNT
);
1451 info
->group_id
= NVC0_QUERY_MP_COUNTER_GROUP
;
1455 /* user asked for info about non-existing query */
1460 nvc0_screen_get_driver_query_group_info(struct pipe_screen
*pscreen
,
1462 struct pipe_driver_query_group_info
*info
)
1464 struct nvc0_screen
*screen
= nvc0_screen(pscreen
);
1467 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
1471 if (screen
->base
.device
->drm_version
>= 0x01000101) {
1472 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1474 } else if (screen
->compute
) {
1475 count
++; /* NVC0_COMPUTE is not always enabled */
1482 if (id
== NVC0_QUERY_MP_COUNTER_GROUP
) {
1483 info
->name
= "MP counters";
1484 info
->type
= PIPE_DRIVER_QUERY_GROUP_TYPE_GPU
;
1486 if (screen
->base
.class_3d
>= NVE4_3D_CLASS
) {
1487 info
->num_queries
= NVE4_PM_QUERY_COUNT
;
1489 /* On NVE4+, each multiprocessor have 8 hardware counters separated
1490 * in two distinct domains, but we allow only one active query
1491 * simultaneously because some of them use more than one hardware
1492 * counter and this will result in an undefined behaviour. */
1493 info
->max_active_queries
= 1; /* TODO: handle multiple hw counters */
1495 } else if (screen
->compute
) {
1496 info
->num_queries
= NVC0_PM_QUERY_COUNT
;
1498 /* On NVC0:NVE4, each multiprocessor have 8 hardware counters
1499 * in a single domain. */
1500 info
->max_active_queries
= 8;
1504 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
1505 else if (id
== NVC0_QUERY_DRV_STAT_GROUP
) {
1506 info
->name
= "Driver statistics";
1507 info
->type
= PIPE_DRIVER_QUERY_GROUP_TYPE_CPU
;
1508 info
->max_active_queries
= NVC0_QUERY_DRV_STAT_COUNT
;
1509 info
->num_queries
= NVC0_QUERY_DRV_STAT_COUNT
;
1514 /* user asked for info about non-existing query group */
1515 info
->name
= "this_is_not_the_query_group_you_are_looking_for";
1516 info
->max_active_queries
= 0;
1517 info
->num_queries
= 0;
1523 nvc0_init_query_functions(struct nvc0_context
*nvc0
)
1525 struct pipe_context
*pipe
= &nvc0
->base
.pipe
;
1527 pipe
->create_query
= nvc0_query_create
;
1528 pipe
->destroy_query
= nvc0_query_destroy
;
1529 pipe
->begin_query
= nvc0_query_begin
;
1530 pipe
->end_query
= nvc0_query_end
;
1531 pipe
->get_query_result
= nvc0_query_result
;
1532 pipe
->render_condition
= nvc0_render_condition
;