/* gallium: add condition parameter to render_condition
 * src/gallium/drivers/nvc0/nvc0_query.c
 */
/*
 * Copyright 2011 Nouveau Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Christoph Bumiller
 */

#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING

#include "nvc0_context.h"
#include "nouveau/nv_object.xml.h"
#include "nve4_compute.xml.h"

#define NVC0_QUERY_STATE_READY   0
#define NVC0_QUERY_STATE_ACTIVE  1
#define NVC0_QUERY_STATE_ENDED   2
#define NVC0_QUERY_STATE_FLUSHED 3

struct nvc0_query {
   uint32_t *data;
   uint16_t type;
   uint16_t index;
   int8_t ctr[4];
   uint32_t sequence;
   struct nouveau_bo *bo;
   uint32_t base;
   uint32_t offset; /* base + i * rotate */
   uint8_t state;
   boolean is64bit;
   uint8_t rotate;
   int nesting; /* only used for occlusion queries */
   union {
      struct nouveau_mm_allocation *mm;
      uint64_t value;
   } u;
   struct nouveau_fence *fence;
};

#define NVC0_QUERY_ALLOC_SPACE 256
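
/* Rotating queries (rotate != 0, i.e. the occlusion queries) advance through
 * their buffer in 32 byte slots, so NVC0_QUERY_ALLOC_SPACE of 256 bytes gives
 * 8 begin/end cycles before nvc0_query_allocate() has to switch buffers.
 * Slot layout as used by begin/end/result below (a sketch inferred from this
 * file, not authoritative):
 *  word 0    sequence, written by the QUERY_GET at end_query
 *  word 1    sample count at end_query
 *  words 2-3 timestamp
 *  word 4    sequence + 1, preset by the CPU for the COND_MODE comparison
 *            (overwritten by a begin report for nested queries)
 *  word 5    sample count at begin_query (preset to 0)
 */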

static void nve4_mp_pm_query_begin(struct nvc0_context *, struct nvc0_query *);
static void nve4_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
static boolean nve4_mp_pm_query_result(struct nvc0_context *,
                                       struct nvc0_query *, void *, boolean);

static INLINE struct nvc0_query *
nvc0_query(struct pipe_query *pipe)
{
   return (struct nvc0_query *)pipe;
}

static boolean
nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
{
   struct nvc0_screen *screen = nvc0->screen;
   int ret;

   if (q->bo) {
      nouveau_bo_ref(NULL, &q->bo);
      if (q->u.mm) {
         if (q->state == NVC0_QUERY_STATE_READY)
            nouveau_mm_free(q->u.mm);
         else
            nouveau_fence_work(screen->base.fence.current,
                               nouveau_mm_free_work, q->u.mm);
      }
   }
   if (size) {
      q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
      if (!q->bo)
         return FALSE;
      q->offset = q->base;

      ret = nouveau_bo_map(q->bo, 0, screen->base.client);
      if (ret) {
         nvc0_query_allocate(nvc0, q, 0);
         return FALSE;
      }
      q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
   }
   return TRUE;
}

static void
nvc0_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
{
   nvc0_query_allocate(nvc0_context(pipe), nvc0_query(pq), 0);
   nouveau_fence_ref(NULL, &nvc0_query(pq)->fence);
   FREE(nvc0_query(pq));
}

static struct pipe_query *
nvc0_query_create(struct pipe_context *pipe, unsigned type)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_query *q;
   unsigned space = NVC0_QUERY_ALLOC_SPACE;

   q = CALLOC_STRUCT(nvc0_query);
   if (!q)
      return NULL;

   switch (type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      q->rotate = 32;
      space = NVC0_QUERY_ALLOC_SPACE;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      q->is64bit = TRUE;
      space = 512;
      break;
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->is64bit = TRUE;
      space = 64;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      q->is64bit = TRUE;
      space = 32;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
      space = 32;
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      space = 16;
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (type >= NVC0_QUERY_DRV_STAT(0) && type <= NVC0_QUERY_DRV_STAT_LAST) {
         space = 0;
         q->is64bit = TRUE;
         q->index = type - NVC0_QUERY_DRV_STAT(0);
         break;
      } else
#endif
      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS &&
          nvc0->screen->base.device->drm_version >= 0x01000101) {
         if (type >= NVE4_PM_QUERY(0) &&
             type <= NVE4_PM_QUERY_LAST) {
            /* for each MP:
             * [00] = WS0.C0
             * [04] = WS0.C1
             * [08] = WS0.C2
             * [0c] = WS0.C3
             * [10] = WS1.C0
             * [14] = WS1.C1
             * [18] = WS1.C2
             * [1c] = WS1.C3
             * [20] = WS2.C0
             * [24] = WS2.C1
             * [28] = WS2.C2
             * [2c] = WS2.C3
             * [30] = WS3.C0
             * [34] = WS3.C1
             * [38] = WS3.C2
             * [3c] = WS3.C3
             * [40] = MP.C4
             * [44] = MP.C5
             * [48] = MP.C6
             * [4c] = MP.C7
             * [50] = WS0.sequence
             * [54] = WS1.sequence
             * [58] = WS2.sequence
             * [5c] = WS3.sequence
             */
            space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
            break;
         }
      }
      debug_printf("invalid query type: %u\n", type);
      FREE(q);
      return NULL;
   }
   if (!nvc0_query_allocate(nvc0, q, space)) {
      FREE(q);
      return NULL;
   }

   q->type = type;

   if (q->rotate) {
      /* we advance before query_begin ! */
      q->offset -= q->rotate;
      q->data -= q->rotate / sizeof(*q->data);
   } else
   if (!q->is64bit)
      q->data[0] = 0; /* initialize sequence */

   return (struct pipe_query *)q;
}
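
/* Usage sketch (state tracker side, hypothetical calls):
 *
 *   struct pipe_query *pq =
 *      pipe->create_query(pipe, PIPE_QUERY_OCCLUSION_COUNTER);
 *   pipe->begin_query(pipe, pq);
 *   ... draw ...
 *   pipe->end_query(pipe, pq);
 */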

static void
nvc0_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q,
               unsigned offset, uint32_t get)
{
   offset += q->offset;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, q->bo->offset + offset);
   PUSH_DATA (push, q->bo->offset + offset);
   PUSH_DATA (push, q->sequence);
   PUSH_DATA (push, get);
}
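
/* The 'get' words passed in from begin/end below are raw NVC0_3D QUERY_GET
 * method values; we don't decode them here, but judging by the result
 * decoding in nvc0_query_result() they produce two report shapes:
 * {u32 sequence, u32 value, u64 timestamp} for the 32-bit queries and
 * {u64 value, u64 timestamp} pairs for the is64bit ones.
 */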

static void
nvc0_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   q->offset += q->rotate;
   q->data += q->rotate / sizeof(*q->data);
   if (q->offset - q->base == NVC0_QUERY_ALLOC_SPACE)
      nvc0_query_allocate(nvc0, q, NVC0_QUERY_ALLOC_SPACE);
}

static void
nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q = nvc0_query(pq);

   /* For occlusion queries we have to change the storage, because a previous
    * query might set the initial render condition to FALSE even *after* we
    * re-initialized it to TRUE.
    */
   if (q->rotate) {
      nvc0_query_rotate(nvc0, q);

      /* XXX: can we do this with the GPU, and sync with respect to a previous
       * query ?
       */
      q->data[0] = q->sequence; /* initialize sequence */
      q->data[1] = 1; /* initial render condition = TRUE */
      q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
      q->data[5] = 0;
   }
   q->sequence++;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      q->nesting = nvc0->screen->num_occlusion_queries_active++;
      if (q->nesting) {
         nvc0_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 3);
         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_query_get(push, q, 0x10, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_query_get(push, q, 0x10, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_query_get(push, q, 0x20, 0x05805002 | (q->index << 5));
      nvc0_query_get(push, q, 0x30, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      nvc0_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_query_get(push, q, 0x10, 0x00005002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nvc0_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
         if (q->index >= 5)
            q->u.value = nvc0->screen->base.stats.v[q->index];
         else
            q->u.value = 0;
      } else
#endif
      if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) {
         nve4_mp_pm_query_begin(nvc0, q);
      }
      break;
   }
   q->state = NVC0_QUERY_STATE_ACTIVE;
}

static void
nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q = nvc0_query(pq);

   if (q->state != NVC0_QUERY_STATE_ACTIVE) {
      /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
      if (q->rotate)
         nvc0_query_rotate(nvc0, q);
      q->sequence++;
   }
   q->state = NVC0_QUERY_STATE_ENDED;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      nvc0_query_get(push, q, 0, 0x0100f002);
      if (--nvc0->screen->num_occlusion_queries_active == 0) {
         PUSH_SPACE(push, 1);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_query_get(push, q, 0, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_query_get(push, q, 0, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_query_get(push, q, 0x00, 0x05805002 | (q->index << 5));
      nvc0_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* TODO: How do we sum over all streams for render condition ? */
      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
      nvc0_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
      nvc0_query_get(push, q, 0x20, 0x00005002);
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_query_get(push, q, 0, 0x00005002);
      break;
   case PIPE_QUERY_GPU_FINISHED:
      nvc0_query_get(push, q, 0, 0x1000f010);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nvc0_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      /* indexed by TFB buffer instead of by vertex stream */
      nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
         q->u.value = nvc0->screen->base.stats.v[q->index] - q->u.value;
         return;
      } else
#endif
      if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST)
         nve4_mp_pm_query_end(nvc0, q);
      break;
   }
   if (q->is64bit)
      nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence);
}

static INLINE void
nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q)
{
   if (q->is64bit) {
      if (nouveau_fence_signalled(q->fence))
         q->state = NVC0_QUERY_STATE_READY;
   } else {
      if (q->data[0] == q->sequence)
         q->state = NVC0_QUERY_STATE_READY;
   }
}
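
/* Readiness protocol: 32-bit queries compare the sequence word written by
 * the GPU against the query's current sequence; the 64-bit reports carry no
 * sequence word, so we conservatively wait for the fence of the pushbuf
 * their QUERY_GETs were emitted in.
 */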

static boolean
nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
                  boolean wait, union pipe_query_result *result)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_query *q = nvc0_query(pq);
   uint64_t *res64 = (uint64_t*)result;
   uint32_t *res32 = (uint32_t*)result;
   boolean *res8 = (boolean*)result;
   uint64_t *data64 = (uint64_t *)q->data;
   unsigned i;

#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
   if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
       q->type <= NVC0_QUERY_DRV_STAT_LAST) {
      res64[0] = q->u.value;
      return TRUE;
   } else
#endif
   if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) {
      return nve4_mp_pm_query_result(nvc0, q, result, wait);
   }

   if (q->state != NVC0_QUERY_STATE_READY)
      nvc0_query_update(nvc0->screen->base.client, q);

   if (q->state != NVC0_QUERY_STATE_READY) {
      if (!wait) {
         if (q->state != NVC0_QUERY_STATE_FLUSHED) {
            q->state = NVC0_QUERY_STATE_FLUSHED;
            /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
            PUSH_KICK(nvc0->base.pushbuf);
         }
         return FALSE;
      }
      if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
         return FALSE;
      NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
   }
   q->state = NVC0_QUERY_STATE_READY;

   switch (q->type) {
   case PIPE_QUERY_GPU_FINISHED:
      res8[0] = TRUE;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
      res64[0] = q->data[1] - q->data[5];
      break;
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      res8[0] = q->data[1] != q->data[5];
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
      res64[0] = data64[0] - data64[2];
      break;
   case PIPE_QUERY_SO_STATISTICS:
      res64[0] = data64[0] - data64[4];
      res64[1] = data64[2] - data64[6];
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      res8[0] = data64[0] != data64[2];
      break;
   case PIPE_QUERY_TIMESTAMP:
      res64[0] = data64[1];
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
      res64[0] = 1000000000;
      res8[8] = (data64[1] == data64[3]) ? FALSE : TRUE;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      res64[0] = data64[1] - data64[3];
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      for (i = 0; i < 10; ++i)
         res64[i] = data64[i * 2] - data64[24 + i * 2];
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      res32[0] = q->data[1];
      break;
   default:
      assert(0); /* can't happen, we don't create queries with invalid type */
      return FALSE;
   }

   return TRUE;
}
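
/* Polling sketch (hypothetical caller): spin without blocking; note that the
 * first unsuccessful poll kicks the pushbuf so the GPU can make progress:
 *
 *   union pipe_query_result res;
 *   while (!pipe->get_query_result(pipe, pq, FALSE, &res))
 *      do_something_else();
 */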

void
nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
{
   struct nvc0_query *q = nvc0_query(pq);
   unsigned offset = q->offset;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
      offset += 0x20;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, q->bo->offset + offset);
   PUSH_DATA (push, q->bo->offset + offset);
   PUSH_DATA (push, q->sequence);
   PUSH_DATA (push, (1 << 12) |
              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
}
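
/* Note that this stalls the channel on a semaphore ACQUIRE_EQUAL until the
 * query's sequence number appears in memory, i.e. the wait happens entirely
 * on the GPU, without a round trip through the CPU.
 */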

static void
nvc0_render_condition(struct pipe_context *pipe,
                      struct pipe_query *pq,
                      boolean condition, uint mode)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q;
   uint32_t cond;
   boolean wait =
      mode != PIPE_RENDER_COND_NO_WAIT &&
      mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;

   nvc0->cond_query = pq;
   nvc0->cond_cond = condition;
   nvc0->cond_mode = mode;

   if (!pq) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
      return;
   }
   q = nvc0_query(pq);

   /* NOTE: comparison of 2 queries only works if both have completed */
   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      cond = condition ? NVC0_3D_COND_MODE_EQUAL :
                         NVC0_3D_COND_MODE_NOT_EQUAL;
      wait = TRUE;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      if (likely(!condition)) {
         if (unlikely(q->nesting))
            cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
                          NVC0_3D_COND_MODE_ALWAYS;
         else
            cond = NVC0_3D_COND_MODE_RES_NON_ZERO;
      } else {
         cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
      }
      break;
   default:
      assert(!"render condition query not a predicate");
      cond = NVC0_3D_COND_MODE_ALWAYS;
      break;
   }

   if (wait)
      nvc0_query_fifo_wait(push, pq);

   PUSH_SPACE(push, 4);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVC0_3D(COND_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, q->bo->offset + q->offset);
   PUSH_DATA (push, q->bo->offset + q->offset);
   PUSH_DATA (push, cond);
}
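
/* Usage sketch (state tracker side, hypothetical calls): draw only when the
 * occlusion predicate passed, then disable conditional rendering again:
 *
 *   pipe->render_condition(pipe, pq, FALSE, PIPE_RENDER_COND_WAIT);
 *   ... conditional draws ...
 *   pipe->render_condition(pipe, NULL, FALSE, 0);
 *
 * Passing condition = TRUE inverts the test, i.e. rendering is discarded
 * where the predicate would pass.
 */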

void
nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
                          struct pipe_query *pq, unsigned result_offset)
{
   struct nvc0_query *q = nvc0_query(pq);

#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))

   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
                        NVC0_IB_ENTRY_1_NO_PREFETCH);
}
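
/* The NO_PREFETCH flag above matters: the query result may be written by the
 * GPU only shortly before this IB entry is consumed, so the pushbuf engine
 * must not fetch the data any earlier than that.
 */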

void
nvc0_so_target_save_offset(struct pipe_context *pipe,
                           struct pipe_stream_output_target *ptarg,
                           unsigned index, boolean *serialize)
{
   struct nvc0_so_target *targ = nvc0_so_target(ptarg);

   if (*serialize) {
      *serialize = FALSE;
      PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
      IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);

      NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1);
   }

   nvc0_query(targ->pq)->index = index;

   nvc0_query_end(pipe, targ->pq);
}


/* === DRIVER STATISTICS === */

#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS

static const char *nvc0_drv_stat_names[] =
{
   "drv-tex_obj_current_count",
   "drv-tex_obj_current_bytes",
   "drv-buf_obj_current_count",
   "drv-buf_obj_current_bytes_vid",
   "drv-buf_obj_current_bytes_sys",
   "drv-tex_transfers_rd",
   "drv-tex_transfers_wr",
   "drv-tex_copy_count",
   "drv-tex_blit_count",
   "drv-tex_cache_flush_count",
   "drv-buf_transfers_rd",
   "drv-buf_transfers_wr",
   "drv-buf_read_bytes_staging_vid",
   "drv-buf_write_bytes_direct",
   "drv-buf_write_bytes_staging_vid",
   "drv-buf_write_bytes_staging_sys",
   "drv-buf_copy_bytes",
   "drv-buf_non_kernel_fence_sync_count",
   "drv-any_non_kernel_fence_sync_count",
   "drv-query_sync_count",
   "drv-gpu_serialize_count",
   "drv-draw_calls_array",
   "drv-draw_calls_indexed",
   "drv-draw_calls_fallback_count",
   "drv-user_buffer_upload_bytes",
   "drv-constbuf_upload_count",
   "drv-constbuf_upload_bytes",
   "drv-pushbuf_count",
   "drv-resource_validate_count"
};

#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */


/* === PERFORMANCE MONITORING COUNTERS === */

/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
static const uint64_t nve4_read_mp_pm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};

/* NOTE: intentionally using the same names as NV */
static const char *nve4_pm_query_names[] =
{
   /* MP counters */
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "warps_launched",
   "threads_launched",
   "sm_cta_launched",
   "inst_issued1",
   "inst_issued2",
   "inst_executed",
   "local_load",
   "local_store",
   "shared_load",
   "shared_store",
   "l1_local_load_hit",
   "l1_local_load_miss",
   "l1_local_store_hit",
   "l1_local_store_miss",
   "gld_request",
   "gst_request",
   "l1_global_load_hit",
   "l1_global_load_miss",
   "uncached_global_load_transaction",
   "global_store_transaction",
   "branch",
   "divergent_branch",
   "active_warps",
   "active_cycles",
   "inst_issued",
   /* metrics, i.e. functions of the MP counters */
   "metric-ipc",                 /* inst_executed, clock */
   "metric-ipac",                /* inst_executed, active_cycles */
   "metric-ipec",                /* inst_executed, (bool)inst_executed */
   "metric-achieved_occupancy",  /* active_warps, active_cycles */
   "metric-sm_efficiency",       /* active_cycles, clock */
   "metric-inst_replay_overhead" /* inst_issued, inst_executed */
};

/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
struct nve4_mp_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t pad     : 3;
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint32_t src_sel : 32; /* signal selection for up to 5 sources */
};

#define NVE4_COUNTER_OPn_SUM        0
#define NVE4_COUNTER_OPn_OR         1
#define NVE4_COUNTER_OPn_AND        2
#define NVE4_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVE4_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0] */
#define NVE4_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
#define NVE4_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0] */

struct nve4_mp_pm_query_cfg
{
   struct nve4_mp_counter_cfg ctr[4];
   uint8_t num_counters;
   uint8_t op;
   uint8_t norm[2]; /* normalization num,denom */
};

#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVE4_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVE4_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVE4_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVE4_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVE4_COUNTER_OP2_##o, { nu, dn } }
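
/* Reading the table below: e.g.
 *   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1)
 * describes a single counter in the A (per warp scheduler) domain with input
 * mask (func) 0x0001, mode B6, signal group USER, source select 0 and an
 * identity normalization of 1/1.
 */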

/* NOTES:
 * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *  this is inaccurate !
 */
static const struct nve4_mp_pm_query_cfg nve4_mp_pm_queries[] =
{
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(LAUNCHED_WARPS,   0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   _Q1A(LAUNCHED_THREADS, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(LAUNCHED_CTA,     0x0001, B6, WARP,   0x0000001c, 1, 1),
   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
   _Q1A(LD_SHARED,   0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1A(ST_SHARED,   0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1A(LD_LOCAL,    0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1A(ST_LOCAL,    0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LOAD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LOAD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_STORE_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_GLOBAL_LOAD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};

#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB

static void
nve4_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const struct nve4_mp_pm_query_cfg *cfg;
   unsigned i, c;
   unsigned num_ab[2] = { 0, 0 };

   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];

   /* check if we have enough free counter slots */
   for (i = 0; i < cfg->num_counters; ++i)
      num_ab[cfg->ctr[i].sig_dom]++;

   if (screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 ||
       screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return;
   }

   assert(cfg->num_counters <= 4);
   PUSH_SPACE(push, 4 * 8 + 6);

   if (!screen->pm.mp_counters_enabled) {
      screen->pm.mp_counters_enabled = TRUE;
      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
      PUSH_DATA (push, 0x1fcb);
   }

   /* set sequence fields to 0 (used to check if result is available);
    * the compute shader writes them at +0x50 of each MP's 0x60 byte block,
    * i.e. words 20..23 (cf. the layout comment in nvc0_query_create and the
    * checks in nve4_mp_pm_query_result)
    */
   for (i = 0; i < screen->mp_count; ++i)
      for (c = 0; c < 4; ++c)
         q->data[i * (0x60 / 4) + 20 + c] = 0;

   for (i = 0; i < cfg->num_counters; ++i) {
      const unsigned d = cfg->ctr[i].sig_dom;

      if (!screen->pm.num_mp_pm_active[d]) {
         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
         if (screen->pm.num_mp_pm_active[!d])
            m |= 1 << (7 + (8 * d));
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, m);
      }
      screen->pm.num_mp_pm_active[d]++;

      for (c = d * 4; c < (d * 4 + 4); ++c) {
         if (!screen->pm.mp_counter[c]) {
            q->ctr[i] = c;
            screen->pm.mp_counter[c] = (struct pipe_query *)q;
            break;
         }
      }
      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */

      /* configure and reset the counter(s) */
      if (d == 0)
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
      else
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
      PUSH_DATA (push, cfg->ctr[i].sig_sel);
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
}

static void
nve4_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint32_t mask;
   uint32_t input[3];
   const uint block[3] = { 32, 4, 1 };
   const uint grid[3] = { screen->mp_count, 1, 1 };
   unsigned c;
   const struct nve4_mp_pm_query_cfg *cfg;

   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];

   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = TRUE;
      prog->num_gprs = 14;
      prog->code = (uint32_t *)nve4_read_mp_pm_counters_code;
      prog->code_size = sizeof(nve4_read_mp_pm_counters_code);
      prog->parm_size = 12;
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c])
         IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (nvc0_query(screen->pm.mp_counter[c]) == q) {
         screen->pm.num_mp_pm_active[c / 4]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                q->bo);

   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (q->bo->offset + q->base);
   input[1] = (q->bo->offset + q->base) >> 32;
   input[2] = q->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0;
   for (c = 0; c < 8; ++c) {
      unsigned i;
      q = nvc0_query(screen->pm.mp_counter[c]);
      if (!q)
         continue;
      cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
      for (i = 0; i < cfg->num_counters; ++i) {
         if (mask & (1 << q->ctr[i]))
            break;
         mask |= 1 << q->ctr[i];
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1);
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}

/* Metric calculations:
 * sum(x) ... sum of x over all MPs
 * avg(x) ... average of x over all MPs
 *
 * IPC              : sum(inst_executed) / clock
 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
 * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
 * MP_EFFICIENCY    : avg(active_cycles / clock)
 *
 * NOTE: Interpretation of IPC requires knowledge of MP count.
 */
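
/* Worked example for the AVG_DIV_MM path below: MP_OCCUPANCY is configured
 * with norm = { 200, 64 }, so each MP contributes count0 * 200 / count1 and
 * the sum is divided by mp_used * 64, i.e. the reported value is
 * avg(active_warps / active_cycles) * 200 / 64.
 */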
static boolean
nve4_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
                        void *result, boolean wait)
{
   uint32_t count[32][4];
   uint64_t value = 0;
   unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
   unsigned p, c, d;
   const struct nve4_mp_pm_query_cfg *cfg;

   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (q->data[b + 20 + d] != q->sequence) {
               if (!wait)
                  return FALSE;
               if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return FALSE;
            }
            if (q->ctr[c] & ~0x3)
               count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)];
            else
               count[p][c] += q->data[b + d * 4 + q->ctr[c]];
         }
      }
   }

   if (cfg->op == NVE4_COUNTER_OPn_SUM) {
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            value += count[p][c];
      value = (value * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVE4_COUNTER_OPn_OR) {
      uint32_t v = 0;
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            v |= count[p][c];
      value = (v * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVE4_COUNTER_OPn_AND) {
      uint32_t v = ~0;
      for (c = 0; c < cfg->num_counters; ++c)
         for (p = 0; p < mp_count; ++p)
            v &= count[p][c];
      value = (v * cfg->norm[0]) / cfg->norm[1];
   } else
   if (cfg->op == NVE4_COUNTER_OP2_REL_SUM_MM) {
      uint64_t v[2] = { 0, 0 };
      for (p = 0; p < mp_count; ++p) {
         v[0] += count[p][0];
         v[1] += count[p][1];
      }
      if (v[0])
         value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
   } else
   if (cfg->op == NVE4_COUNTER_OP2_DIV_SUM_M0) {
      for (p = 0; p < mp_count; ++p)
         value += count[p][0];
      if (count[0][1])
         value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
      else
         value = 0;
   } else
   if (cfg->op == NVE4_COUNTER_OP2_AVG_DIV_MM) {
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         if (count[p][1])
            value += (count[p][0] * cfg->norm[0]) / count[p][1];
      if (mp_used)
         value /= mp_used * cfg->norm[1];
   } else
   if (cfg->op == NVE4_COUNTER_OP2_AVG_DIV_M0) {
      unsigned mp_used = 0;
      for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
         value += count[p][0];
      if (count[0][1] && mp_used) {
         value *= cfg->norm[0];
         value /= count[0][1] * mp_used * cfg->norm[1];
      } else {
         value = 0;
      }
   }

   *(uint64_t *)result = value;
   return TRUE;
}

int
nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
                                  unsigned id,
                                  struct pipe_driver_query_info *info)
{
   struct nvc0_screen *screen = nvc0_screen(pscreen);
   int count = 0;

   count += NVC0_QUERY_DRV_STAT_COUNT;

   if (screen->base.class_3d >= NVE4_3D_CLASS) {
      if (screen->base.device->drm_version >= 0x01000101)
         count += NVE4_PM_QUERY_COUNT;
   }
   if (!info)
      return count;

#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
   if (id < NVC0_QUERY_DRV_STAT_COUNT) {
      info->name = nvc0_drv_stat_names[id];
      info->query_type = NVC0_QUERY_DRV_STAT(id);
      info->max_value = ~0ULL;
      info->uses_byte_units = !!strstr(info->name, "bytes");
      return 1;
   } else
#endif
   if (id < count) {
      info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
      info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
      info->max_value = (info->query_type < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ?
         ~0ULL : 100;
      info->uses_byte_units = FALSE;
      return 1;
   }
   /* user asked for info about non-existing query */
   info->name = "this_is_not_the_query_you_are_looking_for";
   info->query_type = 0xdeadd01d;
   info->max_value = 0;
   info->uses_byte_units = FALSE;
   return 0;
}
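
/* Enumeration sketch (hypothetical caller): a NULL info pointer yields the
 * query count, after which each id can be queried individually:
 *
 *   int n = pscreen->get_driver_query_info(pscreen, 0, NULL);
 *   for (int i = 0; i < n; ++i) {
 *      struct pipe_driver_query_info info;
 *      pscreen->get_driver_query_info(pscreen, i, &info);
 *      printf("%s\n", info.name);
 *   }
 */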

void
nvc0_init_query_functions(struct nvc0_context *nvc0)
{
   struct pipe_context *pipe = &nvc0->base.pipe;

   pipe->create_query = nvc0_query_create;
   pipe->destroy_query = nvc0_query_destroy;
   pipe->begin_query = nvc0_query_begin;
   pipe->end_query = nvc0_query_end;
   pipe->get_query_result = nvc0_query_result;
   pipe->render_condition = nvc0_render_condition;
}