/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
#include "freedreno_query_hw.h"
#include "freedreno_context.h"
#include "freedreno_util.h"

#include "fd4_query.h"
#include "fd4_context.h"
#include "fd4_draw.h"
#include "fd4_format.h"

struct fd_rb_samp_ctrs {
	uint64_t ctr[16];
};

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */

static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	struct fd_hw_sample *samp =
			fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

	/* low bits of sample addr should be zero (since they are control
	 * flags in RB_SAMPLE_COUNT_CONTROL):
	 */
	debug_assert((samp->offset & 0x3) == 0);

	/* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
	 * HW_QUERY_BASE_REG register:
	 */
	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
	OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
	OUT_RING(ring, HW_QUERY_BASE_REG);
	OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY |
			A4XX_RB_SAMPLE_COUNT_CONTROL_ADDR(samp->offset));

	OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
	OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
			INDEX4_SIZE_32_BIT, USE_VISIBILITY));
	OUT_RING(ring, 1);             /* NumInstances */
	OUT_RING(ring, 0);             /* NumIndices */

	fd_event_write(batch, ring, ZPASS_DONE);

	return samp;
}
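
/* At each ZPASS_DONE written above, the RB dumps its sample counters to the
 * per-tile address programmed via RB_SAMPLE_COUNT_CONTROL, so a query sample
 * is a snapshot of struct fd_rb_samp_ctrs.  count_samples() below takes the
 * delta of ctr[0] (samples passing the depth/stencil test) between the end
 * and start snapshots.
 */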

static uint64_t
count_samples(const struct fd_rb_samp_ctrs *start,
		const struct fd_rb_samp_ctrs *end)
{
	return end->ctr[0] - start->ctr[0];
}

static void
occlusion_counter_accumulate_result(struct fd_context *ctx,
		const void *start, const void *end,
		union pipe_query_result *result)
{
	uint64_t n = count_samples(start, end);
	result->u64 += n;
}

static void
occlusion_predicate_accumulate_result(struct fd_context *ctx,
		const void *start, const void *end,
		union pipe_query_result *result)
{
	uint64_t n = count_samples(start, end);
	result->b |= (n > 0);
}

/*
 * Time Elapsed Query:
 *
 * Note: we could in theory support timestamp queries, but they
 * won't give sensible results for tilers.
 */

static void
time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
	/* Right now, the assignment of countable to counter register is
	 * just hard coded. If we start exposing more countables than we
	 * have counters, we will need to be more clever.
	 */
	fd_wfi(ctx->batch, ring);
	OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
	OUT_RING(ring, CP_ALWAYS_COUNT);
}

static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

	/* use unused part of vsc_size_mem as scratch space, to avoid
	 * extra allocation:
	 */
	struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
	const int sample_off = 128;
	const int addr_off = sample_off + 8;

	debug_assert(batch->ctx->screen->max_freq > 0);

	/* Basic issue is that we need to read counter value to a relative
	 * destination (with per-tile offset) rather than absolute dest
	 * addr. But there is no pm4 packet that can do that. This is
	 * where it would be *really* nice if we could write our own fw
	 * since afaict implementing the sort of packet we need would be
	 * pretty trivial.
	 *
	 * Instead, we:
	 * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
	 * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
	 * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
	 *     address to the per-sample offset in the scratch buffer
	 * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
	 *     to CP_ME_NRT_ADDR
	 * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
	 *     buffer to CP_ME_NRT_DATA to trigger the write out to query
	 *     result buffer
	 *
	 * Straightforward, right?
	 *
	 * Maybe could swap the order of things in the scratch buffer to
	 * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
	 * shot, but that's really just polishing a turd..
	 */

	fd_wfi(batch, ring);

	/* copy sample counter _LO and _HI to scratch: */
	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
	OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
			CP_REG_TO_MEM_0_64B |
			CP_REG_TO_MEM_0_CNT(2-1));       /* write 2 regs to mem */
	OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0);

	/* ok... here we really *would* like to use the CP_SET_CONSTANT
	 * mode which can add a constant to value in reg2 and write to
	 * reg1... *but* that only works for banked/context registers,
	 * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
	 * CP math to the scratch buffer instead:
	 *
	 * (note first 8 bytes are counter value, use offset 0x8 for
	 * address calculation)
	 */

	/* per-sample offset to scratch bo: */
	OUT_PKT3(ring, CP_MEM_WRITE, 2);
	OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);
	OUT_RING(ring, samp->offset);

	/* now add to that the per-tile base: */
	OUT_PKT3(ring, CP_REG_TO_MEM, 2);
	OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
			CP_REG_TO_MEM_0_ACCUMULATE |
			CP_REG_TO_MEM_0_CNT(1-1));       /* readback 1 regs */
	OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0);

	/* now copy that back to CP_ME_NRT_ADDR: */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
	OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

	/* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
	 * to trigger the write to result buffer
	 */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
	OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

	/* and again to get the value of the _HI reg from scratch: */
	OUT_PKT3(ring, CP_MEM_TO_REG, 2);
	OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
	OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

	return samp;
}

static void
time_elapsed_accumulate_result(struct fd_context *ctx,
		const void *start, const void *end,
		union pipe_query_result *result)
{
	uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
	/* max_freq is in Hz, convert cycle count to ns: */
	result->u64 += n * 1000000000 / ctx->screen->max_freq;
}
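
/* Worked example of the conversion above (illustrative numbers only): with a
 * hypothetical max_freq of 400000000 Hz (400 MHz), each counter tick is
 * 1000000000 / 400000000 = 2.5 ns, so a delta of n = 4000 cycles adds
 * 4000 * 1000000000 / 400000000 = 10000 ns to result->u64.
 */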

static void
timestamp_accumulate_result(struct fd_context *ctx,
		const void *start, const void *end,
		union pipe_query_result *result)
{
	/* just return the value from the first tile: */
	if (result->u64 != 0)
		return;
	uint64_t n = *(uint64_t *)start;
	/* max_freq is in Hz, convert cycle count to ns: */
	result->u64 = n * 1000000000 / ctx->screen->max_freq;
}

static const struct fd_hw_sample_provider occlusion_counter = {
		.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_counter_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate = {
		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
		.query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
		.active = FD_STAGE_DRAW,
		.get_sample = occlusion_get_sample,
		.accumulate_result = occlusion_predicate_accumulate_result,
};

static const struct fd_hw_sample_provider time_elapsed = {
		.query_type = PIPE_QUERY_TIME_ELAPSED,
		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
		.enable = time_elapsed_enable,
		.get_sample = time_elapsed_get_sample,
		.accumulate_result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
		.query_type = PIPE_QUERY_TIMESTAMP,
		.active = FD_STAGE_ALL,
		.enable = time_elapsed_enable,
		.get_sample = time_elapsed_get_sample,
		.accumulate_result = timestamp_accumulate_result,
};

void fd4_query_context_init(struct pipe_context *pctx)
{
	struct fd_context *ctx = fd_context(pctx);

	ctx->create_query = fd_hw_create_query;
	ctx->query_prepare = fd_hw_query_prepare;
	ctx->query_prepare_tile = fd_hw_query_prepare_tile;
	ctx->query_set_stage = fd_hw_query_set_stage;

	fd_hw_query_register_provider(pctx, &occlusion_counter);
	fd_hw_query_register_provider(pctx, &occlusion_predicate);
	fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
	fd_hw_query_register_provider(pctx, &time_elapsed);
	fd_hw_query_register_provider(pctx, &timestamp);
}
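
/* Illustrative sketch (not compiled, not part of the driver): roughly how a
 * Gallium state tracker would drive one of the providers registered above
 * through the pipe_context query hooks.  The exact member signatures are
 * assumptions here and vary between Mesa versions.
 */
#if 0
static void
example_time_elapsed_query(struct pipe_context *pctx)
{
	union pipe_query_result result = {0};
	struct pipe_query *q;

	q = pctx->create_query(pctx, PIPE_QUERY_TIME_ELAPSED, 0);

	pctx->begin_query(pctx, q);
	/* ... draw calls being timed ... */
	pctx->end_query(pctx, q);

	/* wait=TRUE blocks until every tile's sample has been accumulated;
	 * for PIPE_QUERY_TIME_ELAPSED the result is in nanoseconds (u64).
	 */
	pctx->get_query_result(pctx, q, TRUE, &result);

	pctx->destroy_query(pctx, q);
}
#endif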