src/mesa/drivers/dri/i965/intel_batchbuffer.c
/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

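/**
 * Set up the initial batchbuffer for this context.
 *
 * On non-LLC platforms, commands are accumulated in a malloc'd CPU shadow
 * copy and uploaded to the buffer object at flush time (see
 * do_flush_locked()); on LLC platforms the buffer object is mapped and
 * written directly.
 */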
void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (!brw->has_llc) {
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
      brw->batch.map_next = brw->batch.cpu_map;
   }
}

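/**
 * Allocate a fresh buffer object for the batch and reset the bookkeeping
 * (write pointer, reserved space, state offset, ring).
 *
 * A reference to the previous batch BO is held in last_bo until the next
 * reset; on LLC platforms the new BO is mapped up front so commands can be
 * written into it directly.
 */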
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }
   brw->batch.map_next = brw->batch.map;

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.needs_sol_reset = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

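/**
 * Snapshot the current batch write pointer and relocation count so that a
 * partially built sequence of commands can later be thrown away with
 * intel_batchbuffer_reset_to_saved().
 */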
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

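/**
 * Discard everything emitted since the last intel_batchbuffer_save_state()
 * call, including any relocations recorded for it.
 */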
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}

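/**
 * Free the CPU shadow copy (if any) and drop the references to the current
 * and previous batch buffer objects.
 */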
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
}

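/**
 * Decode the submitted batch and print it to stderr (INTEL_DEBUG=batch)
 * using libdrm's decoder.  Falls back to the CPU copy of the commands if
 * the buffer object cannot be mapped.
 */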
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         USED_BATCH(*batch));
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         USED_BATCH(*batch));
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* We may also need to snapshot and disable OA counters. */
      brw_perf_monitor_finish_batch(brw);

      if (brw->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well.  We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done, so
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* Drop when RS headers get pulled to libdrm */
#ifndef I915_EXEC_RESOURCE_STREAMER
#define I915_EXEC_RESOURCE_STREAMER (1<<15)
#endif

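/**
 * Hand the finished batch to the kernel.
 *
 * On LLC platforms the commands were written directly into the mapped BO,
 * so it only needs to be unmapped; otherwise the CPU shadow copy (commands
 * at the start, indirect state from state_batch_offset up) is uploaded with
 * drm_intel_bo_subdata().  The batch is then executed on the chosen ring,
 * through the hardware context when one is available for the render ring.
 */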
/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER |
            (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
                                        NULL, 0, 0, flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * USED_BATCH(*batch), flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

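/**
 * Flush the accumulated commands to the hardware.
 *
 * Emits any closing state via brw_finish_batch(), terminates the batch with
 * MI_BATCH_BUFFER_END (padded with MI_NOOP to an even number of DWords),
 * submits it with do_flush_locked(), and starts a fresh batch.  The BO is
 * also remembered for swapbuffer throttling.
 */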
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}

/* This is the only way buffers get added to the validate list.
 */
uint32_t
intel_batchbuffer_reloc(struct brw_context *brw,
                        drm_intel_bo *buffer, uint32_t offset,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}

uint64_t
intel_batchbuffer_reloc64(struct brw_context *brw,
                          drm_intel_bo *buffer, uint32_t offset,
                          uint32_t read_domains, uint32_t write_domain,
                          uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}
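
/* The relocation helpers above are normally reached through the OUT_RELOC()
 * and OUT_RELOC64() batch macros (see intel_batchbuffer.h) rather than being
 * called directly.  As an illustrative sketch only, mirroring the Gen7 path
 * of load_sized_register_mem() below, a command that references a buffer
 * object is emitted as:
 *
 *    BEGIN_BATCH(3);
 *    OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
 *    OUT_BATCH(reg);
 *    OUT_RELOC(bo, read_domains, write_domain, offset);
 *    ADVANCE_BATCH();
 *
 * where OUT_RELOC() records the relocation and writes the buffer's presumed
 * offset into the batch.
 */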

void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

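/**
 * Emit MI_LOAD_REGISTER_MEM commands to load `size` consecutive 32-bit
 * registers, starting at `reg`, from a buffer object.  Gen8+ uses 64-bit
 * relocations, so each command is four DWords instead of three.
 */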
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

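/**
 * Load a single 32-bit register from a buffer object.
 */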
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

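/**
 * Load a 64-bit register (a pair of adjacent 32-bit registers) from a
 * buffer object.
 */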
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}