c7d7029fbd0e1bd926b7241dc55c78c6820586c1
[mesa.git] / src / mesa / drivers / dri / i965 / intel_batchbuffer.c
1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35
36 #include "util/hash_table.h"
37
38 #include <xf86drm.h>
39 #include <i915_drm.h>
40
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
42
43 static void
44 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
45 struct brw_bufmgr *bufmgr,
46 bool has_llc);
47
/* Hash-table key equality callback: keys are plain integers smuggled
 * through the pointer value, so two keys are equal exactly when the
 * pointer bits are equal.
 */
static bool
uint_key_compare(const void *key_a, const void *key_b)
{
   return key_a == key_b;
}
53
/* Hash-table hash callback: the key *is* the integer, so just truncate
 * the pointer value to 32 bits.
 */
static uint32_t
uint_key_hash(const void *key)
{
   return (uint32_t)(uintptr_t)key;
}
59
/**
 * One-time initialization of a batchbuffer: allocates the relocation,
 * exec-BO and validation arrays, then calls intel_batchbuffer_reset() to
 * obtain the first batch BO.
 */
void
intel_batchbuffer_init(struct intel_screen *screen,
                       struct intel_batchbuffer *batch)
{
   struct brw_bufmgr *bufmgr = screen->bufmgr;
   const struct gen_device_info *devinfo = &screen->devinfo;

   /* Without LLC, CPU writes through a mapping of the batch BO would be
    * uncached, so build the batch in a malloc'd shadow buffer instead and
    * upload it at flush time (see do_flush_locked).
    */
   if (!devinfo->has_llc) {
      batch->cpu_map = malloc(BATCH_SZ);
      batch->map = batch->cpu_map;
      batch->map_next = batch->cpu_map;
   }

   batch->reloc_count = 0;
   batch->reloc_array_size = 250;
   batch->relocs = malloc(batch->reloc_array_size *
                          sizeof(struct drm_i915_gem_relocation_entry));
   batch->exec_count = 0;
   batch->exec_array_size = 100;
   batch->exec_bos =
      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
   batch->validation_list =
      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));

   /* Per-offset state sizes are only tracked when batch decoding is on. */
   if (INTEL_DEBUG & DEBUG_BATCH) {
      batch->state_batch_sizes =
         _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
   }

   batch->use_batch_first =
      screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;

   /* PIPE_CONTROL needs a w/a but only on gen6 */
   batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
   if (devinfo->gen == 6)
      batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;

   intel_batchbuffer_reset(batch, bufmgr, devinfo->has_llc);
}
99
100 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
101
/**
 * Add \p bo to the batch's execbuf validation list (if not already present)
 * and return its index in that list.
 *
 * Takes a reference on \p bo unless it is the batch buffer itself, which
 * the batch already owns.  Grows the exec/validation arrays on demand.
 */
static unsigned
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
   /* bo->index caches the slot this BO occupied last time it was added;
    * READ_ONCE because other batches may update it concurrently.
    */
   unsigned index = READ_ONCE(bo->index);

   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return index;

   /* May have been shared between multiple active batches */
   for (index = 0; index < batch->exec_count; index++) {
      if (batch->exec_bos[index] == bo)
         return index;
   }

   if (bo != batch->bo)
      brw_bo_reference(bo);

   /* Double both arrays when full; they must stay the same size since
    * entries at the same index describe the same BO.
    */
   if (batch->exec_count == batch->exec_array_size) {
      batch->exec_array_size *= 2;
      batch->exec_bos =
         realloc(batch->exec_bos,
                 batch->exec_array_size * sizeof(batch->exec_bos[0]));
      batch->validation_list =
         realloc(batch->validation_list,
                 batch->exec_array_size * sizeof(batch->validation_list[0]));
   }

   /* Seed the kernel's view of the BO with our presumed offset so it can
    * skip relocation processing when nothing moved.
    */
   batch->validation_list[batch->exec_count] =
      (struct drm_i915_gem_exec_object2) {
         .handle = bo->gem_handle,
         .alignment = bo->align,
         .offset = bo->offset64,
         .flags = bo->kflags,
      };

   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;
   batch->aperture_space += bo->size;

   return batch->exec_count++;
}
143
/**
 * Allocate a fresh batch BO and reset all per-batch bookkeeping.
 *
 * The previous batch BO is kept alive in batch->last_bo (dropping the one
 * before it) — presumably so recently-submitted batch contents stay
 * available a little longer for debugging; TODO confirm intent.
 */
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
                        struct brw_bufmgr *bufmgr,
                        bool has_llc)
{
   if (batch->last_bo != NULL) {
      brw_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->bo;

   batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
   /* With LLC we write commands straight into a coherent mapping of the BO;
    * without it, batch->map stays pointing at the malloc'd cpu_map set up
    * in intel_batchbuffer_init().
    */
   if (has_llc) {
      batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
   }
   batch->map_next = batch->map;

   /* The batch BO itself is always validation-list entry 0. */
   add_exec_bo(batch, batch->bo);
   assert(batch->bo->index == 0);

   batch->reserved_space = BATCH_RESERVED;
   /* State (allocated from the top of the BO, growing down) starts empty. */
   batch->state_batch_offset = batch->bo->size;
   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   batch->ring = UNKNOWN_RING;

   if (batch->state_batch_sizes)
      _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
}
177
/* Reset the batch and forget the set of render targets written by previous
 * batches (they no longer need flushing once the batch boundary passes).
 */
static void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
   brw_render_cache_set_clear(brw);
}
184
185 void
186 intel_batchbuffer_save_state(struct brw_context *brw)
187 {
188 brw->batch.saved.map_next = brw->batch.map_next;
189 brw->batch.saved.reloc_count = brw->batch.reloc_count;
190 brw->batch.saved.exec_count = brw->batch.exec_count;
191 }
192
193 void
194 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
195 {
196 for (int i = brw->batch.saved.exec_count;
197 i < brw->batch.exec_count; i++) {
198 if (brw->batch.exec_bos[i] != brw->batch.bo) {
199 brw_bo_unreference(brw->batch.exec_bos[i]);
200 }
201 }
202 brw->batch.reloc_count = brw->batch.saved.reloc_count;
203 brw->batch.exec_count = brw->batch.saved.exec_count;
204
205 brw->batch.map_next = brw->batch.saved.map_next;
206 if (USED_BATCH(brw->batch) == 0)
207 brw->batch.ring = UNKNOWN_RING;
208 }
209
/**
 * Tear down a batchbuffer: drop every BO reference held by the exec list
 * (the batch BO is unreferenced separately below) and free all arrays.
 */
void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
   /* NULL (LLC path) or the shadow buffer; free(NULL) is fine. */
   free(batch->cpu_map);

   for (int i = 0; i < batch->exec_count; i++) {
      if (batch->exec_bos[i] != batch->bo) {
         brw_bo_unreference(batch->exec_bos[i]);
      }
   }
   free(batch->relocs);
   free(batch->exec_bos);
   free(batch->validation_list);

   brw_bo_unreference(batch->last_bo);
   brw_bo_unreference(batch->bo);
   if (batch->state_batch_sizes)
      _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
}
229
/**
 * Ensure at least \p sz bytes remain in the batch for commands targeting
 * \p ring, flushing the current batch when necessary, and record the ring.
 */
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       devinfo->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

#ifdef DEBUG
   assert(sz < BATCH_SZ - BATCH_RESERVED);
#endif
   if (intel_batchbuffer_space(&brw->batch) < sz)
      intel_batchbuffer_flush(brw);

   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;
}
253
254 #ifdef DEBUG
255 #define CSI "\e["
256 #define BLUE_HEADER CSI "0;44m"
257 #define NORMAL CSI "0m"
258
259
/* Print one instance of \p struct_name (if the spec knows it) located at
 * byte \p offset within the batch contents \p data.  \p gtt_offset is the
 * batch's presumed GPU address, used so printed addresses are absolute.
 */
static void
decode_struct(struct brw_context *brw, struct gen_spec *spec,
              const char *struct_name, uint32_t *data,
              uint32_t gtt_offset, uint32_t offset, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   fprintf(stderr, "%s\n", struct_name);
   gen_print_group(stderr, group, gtt_offset + offset,
                   &data[offset / 4], color);
}
273
/* Print an array of \p struct_name instances starting at byte \p offset in
 * the batch; the element count is derived from the size recorded for this
 * offset at brw_state_batch() time divided by \p struct_size (in bytes).
 */
static void
decode_structs(struct brw_context *brw, struct gen_spec *spec,
               const char *struct_name,
               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
               int struct_size, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   int entries = brw_state_batch_size(brw, offset) / struct_size;
   for (int i = 0; i < entries; i++) {
      fprintf(stderr, "%s %d\n", struct_name, i);
      gen_print_group(stderr, group, gtt_offset + offset,
                      &data[(offset + i * struct_size) / 4], color);
   }
}
291
/**
 * Decode and print the entire batch to stderr (INTEL_DEBUG=bat).
 *
 * Walks the command stream instruction by instruction using the genxml
 * spec, and additionally chases state pointers (viewports, samplers,
 * binding tables, ...) to decode the indirect state they reference.
 * Only render-ring batches are decoded.
 */
static void
do_batch_dump(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct intel_batchbuffer *batch = &brw->batch;
   struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);

   if (batch->ring != RENDER_RING)
      return;

   void *map = brw_bo_map(brw, batch->bo, MAP_READ);
   if (map == NULL) {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer, "
              "dumping uploaded data instead.\n");
   }

   /* Fall back to the CPU shadow copy when the BO couldn't be mapped; in
    * that case we have no GPU address, so use 0.
    */
   uint32_t *data = map ? map : batch->map;
   uint32_t *end = data + USED_BATCH(*batch);
   uint32_t gtt_offset = map ? batch->bo->offset64 : 0;
   int length;

   bool color = INTEL_DEBUG & DEBUG_COLOR;
   const char *header_color = color ? BLUE_HEADER : "";
   const char *reset_color = color ? NORMAL : "";

   for (uint32_t *p = data; p < end; p += length) {
      struct gen_group *inst = gen_spec_find_instruction(spec, p);
      length = gen_group_get_length(inst, p);
      assert(inst == NULL || length > 0);
      /* Advance at least one dword so unknown opcodes can't stall the loop. */
      length = MAX2(1, length);
      if (inst == NULL) {
         fprintf(stderr, "unknown instruction %08x\n", p[0]);
         continue;
      }

      uint64_t offset = gtt_offset + 4 * (p - data);

      fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
              offset, p[0], gen_group_get_name(inst), reset_color);

      gen_print_group(stderr, inst, offset, p, color);

      /* For state-pointer commands, also decode the indirect state. */
      switch (gen_group_get_opcode(inst) >> 16) {
      case _3DSTATE_PIPELINED_POINTERS:
         /* Note: these Gen4-5 pointers are full relocations rather than
          * offsets from the start of the batch.  So we need to subtract
          * gtt_offset (the start of the batch) to obtain an offset we
          * can add to the map and get at the data.
          */
         decode_struct(brw, spec, "VS_STATE", data, gtt_offset,
                       (p[1] & ~0x1fu) - gtt_offset, color);
         if (p[2] & 1) {
            decode_struct(brw, spec, "GS_STATE", data, gtt_offset,
                          (p[2] & ~0x1fu) - gtt_offset, color);
         }
         if (p[3] & 1) {
            decode_struct(brw, spec, "CLIP_STATE", data, gtt_offset,
                          (p[3] & ~0x1fu) - gtt_offset, color);
         }
         decode_struct(brw, spec, "SF_STATE", data, gtt_offset,
                       (p[4] & ~0x1fu) - gtt_offset, color);
         decode_struct(brw, spec, "WM_STATE", data, gtt_offset,
                       (p[5] & ~0x1fu) - gtt_offset, color);
         decode_struct(brw, spec, "COLOR_CALC_STATE", data, gtt_offset,
                       (p[6] & ~0x3fu) - gtt_offset, color);
         break;
      case _3DSTATE_BINDING_TABLE_POINTERS_VS:
      case _3DSTATE_BINDING_TABLE_POINTERS_HS:
      case _3DSTATE_BINDING_TABLE_POINTERS_DS:
      case _3DSTATE_BINDING_TABLE_POINTERS_GS:
      case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
         struct gen_group *group =
            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
         if (!group)
            break;

         /* The binding table is an array of offsets to surface states. */
         uint32_t bt_offset = p[1] & ~0x1fu;
         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
         uint32_t *bt_pointers = &data[bt_offset / 4];
         for (int i = 0; i < bt_entries; i++) {
            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
            gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
                            &data[bt_pointers[i] / 4], color);
         }
         break;
      }
      case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
         decode_structs(brw, spec, "SAMPLER_STATE", data,
                        gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS:
         decode_structs(brw, spec, "CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
         decode_structs(brw, spec, "SF_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
         decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
         break;
      case _3DSTATE_SCISSOR_STATE_POINTERS:
         decode_structs(brw, spec, "SCISSOR_RECT", data,
                        gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
         break;
      case _3DSTATE_BLEND_STATE_POINTERS:
         /* TODO: handle Gen8+ extra dword at the beginning */
         decode_structs(brw, spec, "BLEND_STATE", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         break;
      case _3DSTATE_CC_STATE_POINTERS:
         if (devinfo->gen >= 7) {
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[1] & ~0x3fu, color);
         } else if (devinfo->gen == 6) {
            decode_structs(brw, spec, "BLEND_STATE", data,
                           gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
            decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                          gtt_offset, p[2] & ~0x3fu, color);
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[3] & ~0x3fu, color);
         }
         break;
      case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
         decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                       gtt_offset, p[1] & ~0x3fu, color);
         break;
      }
   }

   if (map != NULL) {
      brw_bo_unmap(batch->bo);
   }
}
436 #else
437 static void do_batch_dump(struct brw_context *brw) { }
438 #endif
439
/**
 * Called when starting a new batch buffer.
 *
 * Releases the previous batch's BO references, allocates a fresh batch via
 * intel_batchbuffer_reset_and_clear_render_cache(), and marks the driver
 * state that must be re-emitted into the new batch.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Unreference any BOs held by the previous batch, and reset counts. */
   for (int i = 0; i < brw->batch.exec_count; i++) {
      if (brw->batch.exec_bos[i] != brw->batch.bo) {
         brw_bo_unreference(brw->batch.exec_bos[i]);
      }
      brw->batch.exec_bos[i] = NULL;
   }
   brw->batch.reloc_count = 0;
   brw->batch.exec_count = 0;
   /* Start the aperture estimate at the batch BO's own size. */
   brw->batch.aperture_space = BATCH_SZ;

   /* Create a new batchbuffer and reset the associated state: */
   intel_batchbuffer_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == 0)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   /* Forget the cached index buffer state; -1 forces re-emission. */
   brw->ib.index_size = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);
}
481
/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
       * assume that the L3 cache is configured according to the hardware
       * defaults.
       */
      if (devinfo->gen >= 7)
         gen7_restore_default_l3_config(brw);

      if (devinfo->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well. We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }
}
531
/**
 * Apply frame-pacing throttles before submitting a new batch: wait on the
 * batch two swaps ago (swap throttle), or poke the kernel's throttle ioctl
 * (flush throttle).
 */
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point for
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling) {
            /* Pass NULL rather than brw so we avoid perf_debug warnings;
             * stalling is common and expected here...
             */
            brw_bo_wait_rendering(brw->throttle_batch[1]);
         }
         brw_bo_unreference(brw->throttle_batch[1]);
      }
      /* Age the throttle window: [0] becomes [1] for the next frame. */
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}
573
/**
 * Submit the batch's validation list to the kernel via
 * DRM_IOCTL_I915_GEM_EXECBUFFER2(_WR) and update each BO's presumed GPU
 * address from the kernel's reply.
 *
 * \param in_fence   sync-file fd the GPU must wait on before executing,
 *                   or -1 for none.
 * \param out_fence  if non-NULL, receives a sync-file fd signalling batch
 *                   completion (forces the _WR flavor of the ioctl).
 * \return 0 on success, negative errno on failure.
 */
static int
execbuffer(int fd,
           struct intel_batchbuffer *batch,
           uint32_t ctx_id,
           int used,
           int in_fence,
           int *out_fence,
           int flags)
{
   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t) batch->validation_list,
      .buffer_count = batch->exec_count,
      .batch_start_offset = 0,
      .batch_len = used,
      .flags = flags,
      .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
   };

   unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;

   if (in_fence != -1) {
      execbuf.rsvd2 = in_fence;
      execbuf.flags |= I915_EXEC_FENCE_IN;
   }

   if (out_fence != NULL) {
      cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
      *out_fence = -1;
      execbuf.flags |= I915_EXEC_FENCE_OUT;
   }

   int ret = drmIoctl(fd, cmd, &execbuf);
   if (ret != 0)
      ret = -errno;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];

      bo->idle = false;
      /* Invalidate the cached validation-list slot (see add_exec_bo). */
      bo->index = -1;

      /* Update brw_bo::offset64 */
      if (batch->validation_list[i].offset != bo->offset64) {
         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
             bo->gem_handle, bo->offset64, batch->validation_list[i].offset);
         bo->offset64 = batch->validation_list[i].offset;
      }
   }

   /* The kernel returns the output fence fd in the upper half of rsvd2. */
   if (ret == 0 && out_fence != NULL)
      *out_fence = execbuf.rsvd2 >> 32;

   return ret;
}
628
/**
 * Upload (non-LLC) or unmap (LLC) the batch contents, then submit the
 * batch to the kernel and apply frame throttling.
 *
 * Exits the process on submission failure — by this point the GL state is
 * unrecoverable.  Returns the execbuffer result (0 on success).
 */
static int
do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   __DRIscreen *dri_screen = brw->screen->driScrnPriv;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      brw_bo_unmap(batch->bo);
   } else {
      /* Upload the CPU shadow copy: commands from the bottom, then the
       * statebuffer portion from the top (if any was allocated).
       */
      ret = brw_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = brw_bo_subdata(batch->bo,
                              batch->state_batch_offset,
                              batch->bo->size - batch->state_batch_offset,
                              (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->screen->no_hw) {
      /* The requirement for using I915_EXEC_NO_RELOC are:
       *
       *   The addresses written in the objects must match the corresponding
       *   reloc.presumed_offset which in turn must match the corresponding
       *   execobject.offset.
       *
       *   Any render targets written to in the batch must be flagged with
       *   EXEC_OBJECT_WRITE.
       *
       *   To avoid stalling, execobject.offset should match the current
       *   address of that object within the active context.
       */
      int flags = I915_EXEC_NO_RELOC;

      if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
         flags |= I915_EXEC_BLT;
      } else {
         flags |= I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         /* Hardware contexts only apply to the render ring. */
         uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;

         /* Attach the relocation list to the batch BO's exec entry. */
         struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
         assert(entry->handle == batch->bo->gem_handle);
         entry->relocation_count = batch->reloc_count;
         entry->relocs_ptr = (uintptr_t) batch->relocs;

         if (batch->use_batch_first) {
            flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
         } else {
            /* Move the batch to the end of the validation list */
            struct drm_i915_gem_exec_object2 tmp;
            const unsigned index = batch->exec_count - 1;

            tmp = *entry;
            *entry = batch->validation_list[index];
            batch->validation_list[index] = tmp;
         }

         ret = execbuffer(dri_screen->fd, batch, hw_ctx,
                          4 * USED_BATCH(*batch),
                          in_fence_fd, out_fence_fd, flags);
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}
713
/**
 * Finish the current batch, submit it, and start a new one.
 *
 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL.  Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_intel_batchbuffer_flush_fence(struct brw_context *brw,
                               int in_fence_fd, int *out_fence_fd,
                               const char *file, int line)
{
   int ret;

   /* Empty batch: nothing to submit. */
   if (USED_BATCH(brw->batch) == 0)
      return 0;

   /* Remember the first batch emitted after the last swap so throttle()
    * can wait on it (see the comment there).
    */
   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      brw_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   /* Let brw_finish_batch() use the space normally held in reserve. */
   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      brw_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}
775
776 bool
777 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
778 {
779 return brw->batch.aperture_space + extra_space <=
780 brw->screen->aperture_threshold;
781 }
782
783 bool
784 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
785 {
786 unsigned index = READ_ONCE(bo->index);
787 if (index < batch->exec_count && batch->exec_bos[index] == bo)
788 return true;
789
790 for (int i = 0; i < batch->exec_count; i++) {
791 if (batch->exec_bos[i] == bo)
792 return true;
793 }
794 return false;
795 }
796
/* This is the only way buffers get added to the validate list.
 *
 * Records a relocation at \p batch_offset pointing at \p target (plus
 * \p target_offset), and returns the address to write there now — the
 * presumed offset — so the kernel can skip relocation processing if the
 * target doesn't move.
 */
uint64_t
brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
               struct brw_bo *target, uint32_t target_offset,
               unsigned int reloc_flags)
{
   assert(target != NULL);

   /* Grow the relocation array on demand. */
   if (batch->reloc_count == batch->reloc_array_size) {
      batch->reloc_array_size *= 2;
      batch->relocs = realloc(batch->relocs,
                              batch->reloc_array_size *
                              sizeof(struct drm_i915_gem_relocation_entry));
   }

   /* Check args */
   assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));

   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

   /* Only flags this batch considers valid are propagated (gen6 adds
    * EXEC_OBJECT_NEEDS_GTT for the PIPE_CONTROL workaround).
    */
   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

   batch->relocs[batch->reloc_count++] =
      (struct drm_i915_gem_relocation_entry) {
         .offset = batch_offset,
         .delta = target_offset,
         /* With I915_EXEC_HANDLE_LUT the kernel expects validation-list
          * indices instead of GEM handles.
          */
         .target_handle = batch->use_batch_first ? index : target->gem_handle,
         .presumed_offset = entry->offset,
      };

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return entry->offset + target_offset;
}
836
/**
 * Copy \p bytes bytes of raw command data into the batch (must be a
 * multiple of 4), flushing first if there isn't enough space.
 */
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}
846
/**
 * Emit MI_LOAD_REGISTER_MEM commands loading \p size consecutive 32-bit
 * registers starting at \p reg from \p bo at \p offset.
 *
 * Gen8+ uses 64-bit addresses, making each command 4 dwords instead of 3.
 */
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset,
                        int size)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(devinfo->gen >= 7);

   if (devinfo->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}
878
/**
 * Load a 32-bit register from a buffer object (Gen7+ only).
 */
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      struct brw_bo *bo,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 1);
}
887
/**
 * Load a 64-bit register (as two consecutive 32-bit halves) from a buffer
 * object (Gen7+ only).
 */
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 2);
}
896
/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   if (devinfo->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      /* RELOC_NEEDS_GGTT: pre-Gen8 apparently requires a global GTT address
       * here — NOTE(review): confirm against PRM / kernel requirements.
       */
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      ADVANCE_BATCH();
   }
}
922
/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (devinfo->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}
957
/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}
974
/*
 * Write a 64-bit register using immediate data.
 *
 * MI_LOAD_REGISTER_IMM takes (offset, value) pairs, so a 64-bit register
 * is written as two 32-bit halves in one command.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}
993
/*
 * Copies a 32-bit register.
 *
 * MI_LOAD_REGISTER_REG is only available on Gen8+ and Haswell.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 8 || devinfo->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}
1010
/*
 * Copies a 64-bit register, as two 32-bit MI_LOAD_REGISTER_REG commands
 * (Gen8+ and Haswell only).
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 8 || devinfo->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}
1030
/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 *
 * Pre-Gen8 MI_STORE_DATA_IMM uses a 32-bit address preceded by an MBZ
 * dword; Gen8+ takes a 64-bit address.
 */
void
brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (devinfo->gen >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}
1053
1054 /*
1055 * Write 64-bits of immediate data to a GPU memory buffer.
1056 */
1057 void
1058 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1059 uint32_t offset, uint64_t imm)
1060 {
1061 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1062
1063 assert(devinfo->gen >= 6);
1064
1065 BEGIN_BATCH(5);
1066 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1067 if (devinfo->gen >= 8)
1068 OUT_RELOC64(bo, 0, offset);
1069 else {
1070 OUT_BATCH(0); /* MBZ */
1071 OUT_RELOC(bo, RELOC_WRITE, offset);
1072 }
1073 OUT_BATCH(imm & 0xffffffffu);
1074 OUT_BATCH(imm >> 32);
1075 ADVANCE_BATCH();
1076 }