i965: Move pipecontrol workaround bo to brw_pipe_control
src/mesa/drivers/dri/i965/intel_batchbuffer.c
/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

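/**
 * Set up the initial batchbuffer state.  On non-LLC systems the batch is
 * assembled in a malloc'ed CPU shadow and uploaded to the BO at flush time.
 */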
void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (!brw->has_llc) {
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
   }
}

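/**
 * Allocate a fresh batchbuffer BO (keeping a reference to the old one in
 * last_bo) and reset the bookkeeping for a new, empty batch.
 */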
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.used = 0;
   brw->batch.needs_sol_reset = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

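/**
 * Record the current batch offset and relocation count so the batch can
 * later be rewound to this point with intel_batchbuffer_reset_to_saved().
 */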
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.used = brw->batch.used;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

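/**
 * Rewind the batch to the point captured by intel_batchbuffer_save_state(),
 * discarding any commands and relocations emitted since then.
 */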
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.used = brw->batch.saved.used;
   if (brw->batch.used == 0)
      brw->batch.ring = UNKNOWN_RING;
}

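/**
 * Release the CPU shadow buffer and drop the references to the current and
 * previous batch BOs.
 */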
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
}

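/**
 * Decode the current batch with libdrm's decoder and print it to stderr
 * (used for INTEL_DEBUG=batch).
 */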
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         batch->used);
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         batch->used);
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch. Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds. We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCH_BUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches). All of this state MUST fit in the reserved space at the
 * end of the batchbuffer. If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* We may also need to snapshot and disable OA counters. */
   if (brw->batch.ring == RENDER_RING)
      brw_perf_monitor_finish_batch(brw);

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

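/**
 * Throttle the GPU: after a swap, wait on the batch from a previous frame so
 * GPU-bound applications don't queue up arbitrarily many frames; for
 * glFlush-based throttling, fall back to the kernel's throttle ioctl.
 */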
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame. At this point, the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
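/**
 * Upload the accumulated batch contents into the batch BO (by unmapping on
 * LLC systems, or copying the CPU shadow with subdata otherwise), then
 * submit it with execbuffer on the appropriate ring, using the hardware
 * context for render-ring batches when one is available.
 */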
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * batch->used, flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

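/**
 * Submit the current batch: emit the closing state via brw_finish_batch(),
 * terminate the command stream with MI_BATCH_BUFFER_END (padded with an
 * MI_NOOP to an even number of DWords), hand it to the kernel, and then
 * start a fresh batch with brw_new_batch().
 */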
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (brw->batch.used == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * brw->batch.used;
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (brw->batch.used & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
bool
intel_batchbuffer_emit_reloc(struct brw_context *brw,
                             drm_intel_bo *buffer,
                             uint32_t read_domains, uint32_t write_domain,
                             uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);

   return true;
}

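/**
 * Like intel_batchbuffer_emit_reloc(), but writes the presumed address as
 * two DWords, for commands that take 64-bit addresses (Gen8+).
 */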
bool
intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                               drm_intel_bo *buffer,
                               uint32_t read_domains, uint32_t write_domain,
                               uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   uint64_t offset = buffer->offset64 + delta;
   intel_batchbuffer_emit_dword(brw, offset);
   intel_batchbuffer_emit_dword(brw, offset >> 32);

   return true;
}


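/**
 * Copy a block of DWord-aligned data into the batch, ensuring there is
 * enough space on the requested ring first.
 */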
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map + brw->batch.used, data, bytes);
   brw->batch.used += bytes >> 2;
}

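/**
 * Emit one MI_LOAD_REGISTER_MEM command per register to load a run of
 * consecutive 32-bit registers, starting at reg, from a buffer object.
 */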
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

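/**
 * Load a single 32-bit register from a buffer object.
 */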
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

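/**
 * Load two consecutive 32-bit registers (a 64-bit value) from a buffer
 * object.
 */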
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}