i965: Drop a useless ret == 0 check.
[mesa.git] / src/mesa/drivers/dri/i965/intel_batchbuffer.c
1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35
36 #include "util/hash_table.h"
37
38 #include <xf86drm.h>
39 #include <i915_drm.h>
40
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
42
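/* Size of each batch buffer object: 8192 DWords (32 KiB). */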
43 #define BATCH_SZ (8192*sizeof(uint32_t))
44
45 static void
46 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
47 struct brw_bufmgr *bufmgr,
48 bool has_llc);
49
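/* Hash table callbacks that treat the pointer value itself as an integer
 * key; they back the state_batch_sizes table created under DEBUG_BATCH.
 */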
50 static bool
51 uint_key_compare(const void *a, const void *b)
52 {
53 return a == b;
54 }
55
56 static uint32_t
57 uint_key_hash(const void *key)
58 {
59 return (uintptr_t) key;
60 }
61
62 void
63 intel_batchbuffer_init(struct intel_screen *screen,
64 struct intel_batchbuffer *batch)
65 {
66 struct brw_bufmgr *bufmgr = screen->bufmgr;
67 const struct gen_device_info *devinfo = &screen->devinfo;
68
69 if (!devinfo->has_llc) {
70 batch->cpu_map = malloc(BATCH_SZ);
71 batch->map = batch->cpu_map;
72 batch->map_next = batch->cpu_map;
73 }
74
75 batch->reloc_count = 0;
76 batch->reloc_array_size = 250;
77 batch->relocs = malloc(batch->reloc_array_size *
78 sizeof(struct drm_i915_gem_relocation_entry));
79 batch->exec_count = 0;
80 batch->exec_array_size = 100;
81 batch->exec_bos =
82 malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
83 batch->validation_list =
84 malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
85
86 if (INTEL_DEBUG & DEBUG_BATCH) {
87 batch->state_batch_sizes =
88 _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
89 }
90
91 batch->use_batch_first =
92 screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
93
94 /* PIPE_CONTROL needs a workaround, but only on Gen6: its write target must be bound in the global GTT, so allow EXEC_OBJECT_NEEDS_GTT there. */
95 batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
96 if (devinfo->gen == 6)
97 batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
98
99 intel_batchbuffer_reset(batch, bufmgr, devinfo->has_llc);
100 }
101
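/* Read x exactly once through a volatile cast so the compiler cannot
 * re-load it; bo->index may be changed behind our back by another batch.
 */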
102 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
103
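/* Add a BO to the batch's execbuf validation list (taking a reference)
 * if it is not already present, and return its index in that list.
 */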
104 static unsigned
105 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
106 {
107 unsigned index = READ_ONCE(bo->index);
108
109 if (index < batch->exec_count && batch->exec_bos[index] == bo)
110 return index;
111
112 /* May have been shared between multiple active batches */
113 for (index = 0; index < batch->exec_count; index++) {
114 if (batch->exec_bos[index] == bo)
115 return index;
116 }
117
118 brw_bo_reference(bo);
119
120 if (batch->exec_count == batch->exec_array_size) {
121 batch->exec_array_size *= 2;
122 batch->exec_bos =
123 realloc(batch->exec_bos,
124 batch->exec_array_size * sizeof(batch->exec_bos[0]));
125 batch->validation_list =
126 realloc(batch->validation_list,
127 batch->exec_array_size * sizeof(batch->validation_list[0]));
128 }
129
130 batch->validation_list[batch->exec_count] =
131 (struct drm_i915_gem_exec_object2) {
132 .handle = bo->gem_handle,
133 .alignment = bo->align,
134 .offset = bo->gtt_offset,
135 .flags = bo->kflags,
136 };
137
138 bo->index = batch->exec_count;
139 batch->exec_bos[batch->exec_count] = bo;
140 batch->aperture_space += bo->size;
141
142 return batch->exec_count++;
143 }
144
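/* Start a fresh batch: keep a reference to the old buffer as last_bo,
 * allocate and (on LLC platforms) map a new batch BO, and reset the
 * associated bookkeeping.
 */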
145 static void
146 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
147 struct brw_bufmgr *bufmgr,
148 bool has_llc)
149 {
150 if (batch->last_bo != NULL) {
151 brw_bo_unreference(batch->last_bo);
152 batch->last_bo = NULL;
153 }
154 batch->last_bo = batch->bo;
155
156 batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
157 if (has_llc) {
158 batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
159 }
160 batch->map_next = batch->map;
161
162 add_exec_bo(batch, batch->bo);
163 assert(batch->bo->index == 0);
164
165 batch->reserved_space = BATCH_RESERVED;
166 batch->state_batch_offset = batch->bo->size;
167 batch->needs_sol_reset = false;
168 batch->state_base_address_emitted = false;
169
170 /* We don't know what ring the new batch will be sent to until we see the
171 * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
172 */
173 batch->ring = UNKNOWN_RING;
174
175 if (batch->state_batch_sizes)
176 _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
177 }
178
179 static void
180 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
181 {
182 const struct gen_device_info *devinfo = &brw->screen->devinfo;
183
184 intel_batchbuffer_reset(&brw->batch, brw->bufmgr, devinfo->has_llc);
185 brw_render_cache_set_clear(brw);
186 }
187
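/* Record a rollback point so intel_batchbuffer_reset_to_saved() can later
 * discard any commands, relocations and exec BOs emitted after this call.
 */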
188 void
189 intel_batchbuffer_save_state(struct brw_context *brw)
190 {
191 brw->batch.saved.map_next = brw->batch.map_next;
192 brw->batch.saved.reloc_count = brw->batch.reloc_count;
193 brw->batch.saved.exec_count = brw->batch.exec_count;
194 }
195
196 void
197 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
198 {
199 for (int i = brw->batch.saved.exec_count;
200 i < brw->batch.exec_count; i++) {
201 brw_bo_unreference(brw->batch.exec_bos[i]);
202 }
203 brw->batch.reloc_count = brw->batch.saved.reloc_count;
204 brw->batch.exec_count = brw->batch.saved.exec_count;
205
206 brw->batch.map_next = brw->batch.saved.map_next;
207 if (USED_BATCH(brw->batch) == 0)
208 brw->batch.ring = UNKNOWN_RING;
209 }
210
211 void
212 intel_batchbuffer_free(struct intel_batchbuffer *batch)
213 {
214 free(batch->cpu_map);
215
216 for (int i = 0; i < batch->exec_count; i++) {
217 brw_bo_unreference(batch->exec_bos[i]);
218 }
219 free(batch->relocs);
220 free(batch->exec_bos);
221 free(batch->validation_list);
222
223 brw_bo_unreference(batch->last_bo);
224 brw_bo_unreference(batch->bo);
225 if (batch->state_batch_sizes)
226 _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
227 }
228
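/* Guarantee that at least sz bytes remain in the batch for commands
 * targeting the given ring, flushing the current batch if necessary.
 */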
229 void
230 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
231 enum brw_gpu_ring ring)
232 {
233 const struct gen_device_info *devinfo = &brw->screen->devinfo;
234
235 /* If we're switching rings, implicitly flush the batch. */
236 if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
237 devinfo->gen >= 6) {
238 intel_batchbuffer_flush(brw);
239 }
240
241 if (intel_batchbuffer_space(&brw->batch) < sz)
242 intel_batchbuffer_flush(brw);
243
244 /* The intel_batchbuffer_flush() calls above might have changed
245 * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
246 */
247 brw->batch.ring = ring;
248 }
249
250 #ifdef DEBUG
251 #define CSI "\e["
252 #define BLUE_HEADER CSI "0;44m"
253 #define NORMAL CSI "0m"
254
255
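/* Batch-dump helpers: look up a named state structure in the gen_spec and
 * pretty-print a single instance (decode_struct) or a contiguous array of
 * instances (decode_structs) from the mapped batch.
 */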
256 static void
257 decode_struct(struct brw_context *brw, struct gen_spec *spec,
258 const char *struct_name, uint32_t *data,
259 uint32_t gtt_offset, uint32_t offset, bool color)
260 {
261 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
262 if (!group)
263 return;
264
265 fprintf(stderr, "%s\n", struct_name);
266 gen_print_group(stderr, group, gtt_offset + offset,
267 &data[offset / 4], color);
268 }
269
270 static void
271 decode_structs(struct brw_context *brw, struct gen_spec *spec,
272 const char *struct_name,
273 uint32_t *data, uint32_t gtt_offset, uint32_t offset,
274 int struct_size, bool color)
275 {
276 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
277 if (!group)
278 return;
279
280 int entries = brw_state_batch_size(brw, offset) / struct_size;
281 for (int i = 0; i < entries; i++) {
282 fprintf(stderr, "%s %d\n", struct_name, i);
283 gen_print_group(stderr, group, gtt_offset + offset,
284 &data[(offset + i * struct_size) / 4], color);
285 }
286 }
287
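/* Decode and print the whole batch, following pointer commands into the
 * indirect state they reference. Only render-ring batches are decoded.
 */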
288 static void
289 do_batch_dump(struct brw_context *brw)
290 {
291 const struct gen_device_info *devinfo = &brw->screen->devinfo;
292 struct intel_batchbuffer *batch = &brw->batch;
293 struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
294
295 if (batch->ring != RENDER_RING)
296 return;
297
298 void *map = brw_bo_map(brw, batch->bo, MAP_READ);
299 if (map == NULL) {
300 fprintf(stderr,
301 "WARNING: failed to map batchbuffer, "
302 "dumping uploaded data instead.\n");
303 }
304
305 uint32_t *data = map ? map : batch->map;
306 uint32_t *end = data + USED_BATCH(*batch);
307 uint32_t gtt_offset = map ? batch->bo->gtt_offset : 0;
308 int length;
309
310 bool color = INTEL_DEBUG & DEBUG_COLOR;
311 const char *header_color = color ? BLUE_HEADER : "";
312 const char *reset_color = color ? NORMAL : "";
313
314 for (uint32_t *p = data; p < end; p += length) {
315 struct gen_group *inst = gen_spec_find_instruction(spec, p);
316 length = gen_group_get_length(inst, p);
317 assert(inst == NULL || length > 0);
318 length = MAX2(1, length);
319 if (inst == NULL) {
320 fprintf(stderr, "unknown instruction %08x\n", p[0]);
321 continue;
322 }
323
324 uint64_t offset = gtt_offset + 4 * (p - data);
325
326 fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
327 offset, p[0], gen_group_get_name(inst), reset_color);
328
329 gen_print_group(stderr, inst, offset, p, color);
330
331 switch (gen_group_get_opcode(inst) >> 16) {
332 case _3DSTATE_PIPELINED_POINTERS:
333 /* Note: these Gen4-5 pointers are full relocations rather than
334 * offsets from the start of the batch. So we need to subtract
335 * gtt_offset (the start of the batch) to obtain an offset we
336 * can add to the map and get at the data.
337 */
338 decode_struct(brw, spec, "VS_STATE", data, gtt_offset,
339 (p[1] & ~0x1fu) - gtt_offset, color);
340 if (p[2] & 1) {
341 decode_struct(brw, spec, "GS_STATE", data, gtt_offset,
342 (p[2] & ~0x1fu) - gtt_offset, color);
343 }
344 if (p[3] & 1) {
345 decode_struct(brw, spec, "CLIP_STATE", data, gtt_offset,
346 (p[3] & ~0x1fu) - gtt_offset, color);
347 }
348 decode_struct(brw, spec, "SF_STATE", data, gtt_offset,
349 (p[4] & ~0x1fu) - gtt_offset, color);
350 decode_struct(brw, spec, "WM_STATE", data, gtt_offset,
351 (p[5] & ~0x1fu) - gtt_offset, color);
352 decode_struct(brw, spec, "COLOR_CALC_STATE", data, gtt_offset,
353 (p[6] & ~0x3fu) - gtt_offset, color);
354 break;
355 case _3DSTATE_BINDING_TABLE_POINTERS_VS:
356 case _3DSTATE_BINDING_TABLE_POINTERS_HS:
357 case _3DSTATE_BINDING_TABLE_POINTERS_DS:
358 case _3DSTATE_BINDING_TABLE_POINTERS_GS:
359 case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
360 struct gen_group *group =
361 gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
362 if (!group)
363 break;
364
365 uint32_t bt_offset = p[1] & ~0x1fu;
366 int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
367 uint32_t *bt_pointers = &data[bt_offset / 4];
368 for (int i = 0; i < bt_entries; i++) {
369 fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
370 gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
371 &data[bt_pointers[i] / 4], color);
372 }
373 break;
374 }
375 case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
376 case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
377 case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
378 case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
379 case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
380 decode_structs(brw, spec, "SAMPLER_STATE", data,
381 gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
382 break;
383 case _3DSTATE_VIEWPORT_STATE_POINTERS:
384 decode_structs(brw, spec, "CLIP_VIEWPORT", data,
385 gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
386 decode_structs(brw, spec, "SF_VIEWPORT", data,
387 gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
388 decode_structs(brw, spec, "CC_VIEWPORT", data,
389 gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
390 break;
391 case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
392 decode_structs(brw, spec, "CC_VIEWPORT", data,
393 gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
394 break;
395 case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
396 decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
397 gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
398 break;
399 case _3DSTATE_SCISSOR_STATE_POINTERS:
400 decode_structs(brw, spec, "SCISSOR_RECT", data,
401 gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
402 break;
403 case _3DSTATE_BLEND_STATE_POINTERS:
404 /* TODO: handle Gen8+ extra dword at the beginning */
405 decode_structs(brw, spec, "BLEND_STATE", data,
406 gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
407 break;
408 case _3DSTATE_CC_STATE_POINTERS:
409 if (devinfo->gen >= 7) {
410 decode_struct(brw, spec, "COLOR_CALC_STATE", data,
411 gtt_offset, p[1] & ~0x3fu, color);
412 } else if (devinfo->gen == 6) {
413 decode_structs(brw, spec, "BLEND_STATE", data,
414 gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
415 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
416 gtt_offset, p[2] & ~0x3fu, color);
417 decode_struct(brw, spec, "COLOR_CALC_STATE", data,
418 gtt_offset, p[3] & ~0x3fu, color);
419 }
420 break;
421 case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
422 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
423 gtt_offset, p[1] & ~0x3fu, color);
424 break;
425 }
426 }
427
428 if (map != NULL) {
429 brw_bo_unmap(batch->bo);
430 }
431 }
432 #else
433 static void do_batch_dump(struct brw_context *brw) { }
434 #endif
435
436 /**
437 * Called when starting a new batch buffer.
438 */
439 static void
440 brw_new_batch(struct brw_context *brw)
441 {
442 /* Unreference any BOs held by the previous batch, and reset counts. */
443 for (int i = 0; i < brw->batch.exec_count; i++) {
444 brw_bo_unreference(brw->batch.exec_bos[i]);
445 brw->batch.exec_bos[i] = NULL;
446 }
447 brw->batch.reloc_count = 0;
448 brw->batch.exec_count = 0;
449 brw->batch.aperture_space = 0;
450
451 /* Create a new batchbuffer and reset the associated state: */
452 intel_batchbuffer_reset_and_clear_render_cache(brw);
453
454 /* If the kernel supports hardware contexts, then most hardware state is
455 * preserved between batches; we only need to re-emit state that is required
456 * to be in every batch. Otherwise we need to re-emit all the state that
457 * would normally be stored in the context (which for all intents and
458 * purposes means everything).
459 */
460 if (brw->hw_ctx == 0)
461 brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
462
463 brw->ctx.NewDriverState |= BRW_NEW_BATCH;
464
465 brw->ib.index_size = -1;
466
467 /* We need to periodically reap the shader time results, because rollover
468 * happens every few seconds. We also want to see results every once in a
469 * while, because many programs won't cleanly destroy our context, so the
470 * end-of-run printout may not happen.
471 */
472 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
473 brw_collect_and_report_shader_time(brw);
474 }
475
476 /**
477 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
478 * sending it off.
479 *
480 * This function can emit state (say, to preserve registers that aren't saved
481 * between batches). All of this state MUST fit in the reserved space at the
482 * end of the batchbuffer. If you add more GPU state, increase the reserved
483 * space by updating the BATCH_RESERVED macro.
484 */
485 static void
486 brw_finish_batch(struct brw_context *brw)
487 {
488 const struct gen_device_info *devinfo = &brw->screen->devinfo;
489
490 /* Capture the closing pipeline statistics register values necessary to
491 * support query objects (in the non-hardware context world).
492 */
493 brw_emit_query_end(brw);
494
495 if (brw->batch.ring == RENDER_RING) {
496 /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT
497 * and therefore assume that the L3 cache is configured according to the
498 * hardware defaults.
499 */
500 if (devinfo->gen >= 7)
501 gen7_restore_default_l3_config(brw);
502
503 if (devinfo->is_haswell) {
504 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
505 * 3DSTATE_CC_STATE_POINTERS > "Note":
506 *
507 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
508 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
509 *
510 * From the example in the docs, it seems to expect a regular pipe control
511 * flush here as well. We may have done it already, but meh.
512 *
513 * See also WaAvoidRCZCounterRollover.
514 */
515 brw_emit_mi_flush(brw);
516 BEGIN_BATCH(2);
517 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
518 OUT_BATCH(brw->cc.state_offset | 1);
519 ADVANCE_BATCH();
520 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
521 PIPE_CONTROL_CS_STALL);
522 }
523 }
524 }
525
526 static void
527 throttle(struct brw_context *brw)
528 {
529 /* Wait for the swapbuffers before the one we just emitted, so we
530 * don't get too many swaps outstanding for apps that are GPU-heavy
531 * but not CPU-heavy.
532 *
533 * We're using intelDRI2Flush (called from the loader before
534 * swapbuffer) and glFlush (for front buffer rendering) as the
535 * indicator that a frame is done and then throttle when we get
536 * here as we prepare to render the next frame. At this point for
537 * round trips for swap/copy and getting new buffers are done and
538 * we'll spend less time waiting on the GPU.
539 *
540 * Unfortunately, we don't have a handle to the batch containing
541 * the swap, and getting our hands on that doesn't seem worth it,
542 * so we just use the first batch we emitted after the last swap.
543 */
544 if (brw->need_swap_throttle && brw->throttle_batch[0]) {
545 if (brw->throttle_batch[1]) {
546 if (!brw->disable_throttling) {
547 /* Pass NULL rather than brw so we avoid perf_debug warnings;
548 * stalling is common and expected here...
549 */
550 brw_bo_wait_rendering(brw->throttle_batch[1]);
551 }
552 brw_bo_unreference(brw->throttle_batch[1]);
553 }
554 brw->throttle_batch[1] = brw->throttle_batch[0];
555 brw->throttle_batch[0] = NULL;
556 brw->need_swap_throttle = false;
557 /* Throttling here is more precise than the throttle ioctl, so skip it */
558 brw->need_flush_throttle = false;
559 }
560
561 if (brw->need_flush_throttle) {
562 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
563 drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
564 brw->need_flush_throttle = false;
565 }
566 }
567
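/* Submit the validation list with DRM_IOCTL_I915_GEM_EXECBUFFER2 (or the
 * _WR variant when an out-fence is requested) and write back the offsets
 * the kernel chose for each BO. Returns 0 on success or -errno on failure.
 */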
568 static int
569 execbuffer(int fd,
570 struct intel_batchbuffer *batch,
571 uint32_t ctx_id,
572 int used,
573 int in_fence,
574 int *out_fence,
575 int flags)
576 {
577 struct drm_i915_gem_execbuffer2 execbuf = {
578 .buffers_ptr = (uintptr_t) batch->validation_list,
579 .buffer_count = batch->exec_count,
580 .batch_start_offset = 0,
581 .batch_len = used,
582 .flags = flags,
583 .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
584 };
585
586 unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
587
588 if (in_fence != -1) {
589 execbuf.rsvd2 = in_fence;
590 execbuf.flags |= I915_EXEC_FENCE_IN;
591 }
592
593 if (out_fence != NULL) {
594 cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
595 *out_fence = -1;
596 execbuf.flags |= I915_EXEC_FENCE_OUT;
597 }
598
599 int ret = drmIoctl(fd, cmd, &execbuf);
600 if (ret != 0)
601 ret = -errno;
602
603 for (int i = 0; i < batch->exec_count; i++) {
604 struct brw_bo *bo = batch->exec_bos[i];
605
606 bo->idle = false;
607 bo->index = -1;
608
609 /* Update brw_bo::gtt_offset */
610 if (batch->validation_list[i].offset != bo->gtt_offset) {
611 DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
612 bo->gem_handle, bo->gtt_offset,
613 batch->validation_list[i].offset);
614 bo->gtt_offset = batch->validation_list[i].offset;
615 }
616 }
617
618 if (ret == 0 && out_fence != NULL)
619 *out_fence = execbuf.rsvd2 >> 32;
620
621 return ret;
622 }
623
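/* Finish preparing the batch for submission: copy the CPU shadow map into
 * the batch BO if there is one, choose the execbuf flags and hardware
 * context, and hand everything to execbuffer().
 */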
624 static int
625 do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
626 {
627 const struct gen_device_info *devinfo = &brw->screen->devinfo;
628 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
629 struct intel_batchbuffer *batch = &brw->batch;
630 int ret = 0;
631
632 if (batch->cpu_map) {
633 void *bo_map = brw_bo_map(brw, batch->bo, MAP_WRITE);
634 memcpy(bo_map, batch->cpu_map, 4 * USED_BATCH(*batch));
635 memcpy(bo_map + batch->state_batch_offset,
636 (char *) batch->cpu_map + batch->state_batch_offset,
637 batch->bo->size - batch->state_batch_offset);
638 }
639
640 brw_bo_unmap(batch->bo);
641
642 if (!brw->screen->no_hw) {
643 /* The requirements for using I915_EXEC_NO_RELOC are:
644 *
645 * The addresses written in the objects must match the corresponding
646 * reloc.gtt_offset which in turn must match the corresponding
647 * execobject.offset.
648 *
649 * Any render targets written to in the batch must be flagged with
650 * EXEC_OBJECT_WRITE.
651 *
652 * To avoid stalling, execobject.offset should match the current
653 * address of that object within the active context.
654 */
655 int flags = I915_EXEC_NO_RELOC;
656
657 if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
658 flags |= I915_EXEC_BLT;
659 } else {
660 flags |= I915_EXEC_RENDER;
661 }
662 if (batch->needs_sol_reset)
663 flags |= I915_EXEC_GEN7_SOL_RESET;
664
665 uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
666
667 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
668 assert(entry->handle == batch->bo->gem_handle);
669 entry->relocation_count = batch->reloc_count;
670 entry->relocs_ptr = (uintptr_t) batch->relocs;
671
672 if (batch->use_batch_first) {
673 flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
674 } else {
675 /* Move the batch to the end of the validation list */
676 struct drm_i915_gem_exec_object2 tmp;
677 const unsigned index = batch->exec_count - 1;
678
679 tmp = *entry;
680 *entry = batch->validation_list[index];
681 batch->validation_list[index] = tmp;
682 }
683
684 ret = execbuffer(dri_screen->fd, batch, hw_ctx,
685 4 * USED_BATCH(*batch),
686 in_fence_fd, out_fence_fd, flags);
687
688 throttle(brw);
689 }
690
691 if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
692 do_batch_dump(brw);
693
694 if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
695 brw_check_for_reset(brw);
696
697 if (ret != 0) {
698 fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
699 exit(1);
700 }
701
702 return ret;
703 }
704
705 /**
706 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
707 * of the fd.
708 *
709 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
710 * of the returned fd.
711 */
712 int
713 _intel_batchbuffer_flush_fence(struct brw_context *brw,
714 int in_fence_fd, int *out_fence_fd,
715 const char *file, int line)
716 {
717 int ret;
718
719 if (USED_BATCH(brw->batch) == 0)
720 return 0;
721
722 if (brw->throttle_batch[0] == NULL) {
723 brw->throttle_batch[0] = brw->batch.bo;
724 brw_bo_reference(brw->throttle_batch[0]);
725 }
726
727 if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
728 int bytes_for_commands = 4 * USED_BATCH(brw->batch);
729 int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
730 int total_bytes = bytes_for_commands + bytes_for_state;
731 fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
732 "%4db (state) = %4db (%0.1f%%)\n", file, line,
733 bytes_for_commands, bytes_for_state,
734 total_bytes,
735 100.0f * total_bytes / BATCH_SZ);
736 }
737
738 brw->batch.reserved_space = 0;
739
740 brw_finish_batch(brw);
741
742 /* Mark the end of the buffer. */
743 intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
744 if (USED_BATCH(brw->batch) & 1) {
745 /* Round batchbuffer usage to 2 DWORDs. */
746 intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
747 }
748
749 intel_upload_finish(brw);
750
751 /* Check that we didn't just wrap our batchbuffer at a bad time. */
752 assert(!brw->no_batch_wrap);
753
754 ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);
755
756 if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
757 fprintf(stderr, "waiting for idle\n");
758 brw_bo_wait_rendering(brw->batch.bo);
759 }
760
761 /* Start a new batch buffer. */
762 brw_new_batch(brw);
763
764 return ret;
765 }
766
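/* Check whether the BOs referenced so far, plus extra_space, still fit
 * under the screen's aperture threshold.
 */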
767 bool
768 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
769 {
770 return brw->batch.aperture_space + extra_space <=
771 brw->screen->aperture_threshold;
772 }
773
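/* Return true if the given BO is already on this batch's exec list. */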
774 bool
775 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
776 {
777 unsigned index = READ_ONCE(bo->index);
778 if (index < batch->exec_count && batch->exec_bos[index] == bo)
779 return true;
780
781 for (int i = 0; i < batch->exec_count; i++) {
782 if (batch->exec_bos[i] == bo)
783 return true;
784 }
785 return false;
786 }
787
788 /* This is the only way buffers get added to the validation list.
789 */
790 uint64_t
791 brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
792 struct brw_bo *target, uint32_t target_offset,
793 unsigned int reloc_flags)
794 {
795 assert(target != NULL);
796
797 if (batch->reloc_count == batch->reloc_array_size) {
798 batch->reloc_array_size *= 2;
799 batch->relocs = realloc(batch->relocs,
800 batch->reloc_array_size *
801 sizeof(struct drm_i915_gem_relocation_entry));
802 }
803
804 /* Check args */
805 assert(batch_offset <= batch->bo->size - sizeof(uint32_t));
806
807 unsigned int index = add_exec_bo(batch, target);
808 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
809
810 if (reloc_flags)
811 entry->flags |= reloc_flags & batch->valid_reloc_flags;
812
813 batch->relocs[batch->reloc_count++] =
814 (struct drm_i915_gem_relocation_entry) {
815 .offset = batch_offset,
816 .delta = target_offset,
817 .target_handle = batch->use_batch_first ? index : target->gem_handle,
818 .presumed_offset = entry->offset,
819 };
820
821 /* Using the old buffer offset, write in what the right data would be, in
822 * case the buffer doesn't move and we can short-circuit the relocation
823 * processing in the kernel.
824 */
825 return entry->offset + target_offset;
826 }
827
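/* Copy a DWord-aligned block of commands straight into the batch, making
 * room for it first if necessary.
 */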
828 void
829 intel_batchbuffer_data(struct brw_context *brw,
830 const void *data, GLuint bytes, enum brw_gpu_ring ring)
831 {
832 assert((bytes & 3) == 0);
833 intel_batchbuffer_require_space(brw, bytes, ring);
834 memcpy(brw->batch.map_next, data, bytes);
835 brw->batch.map_next += bytes >> 2;
836 }
837
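/* Emit `size` MI_LOAD_REGISTER_MEM commands, loading consecutive 32-bit
 * registers starting at `reg` from consecutive DWords at bo + offset.
 */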
838 static void
839 load_sized_register_mem(struct brw_context *brw,
840 uint32_t reg,
841 struct brw_bo *bo,
842 uint32_t offset,
843 int size)
844 {
845 const struct gen_device_info *devinfo = &brw->screen->devinfo;
846 int i;
847
848 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
849 assert(devinfo->gen >= 7);
850
851 if (devinfo->gen >= 8) {
852 BEGIN_BATCH(4 * size);
853 for (i = 0; i < size; i++) {
854 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
855 OUT_BATCH(reg + i * 4);
856 OUT_RELOC64(bo, 0, offset + i * 4);
857 }
858 ADVANCE_BATCH();
859 } else {
860 BEGIN_BATCH(3 * size);
861 for (i = 0; i < size; i++) {
862 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
863 OUT_BATCH(reg + i * 4);
864 OUT_RELOC(bo, 0, offset + i * 4);
865 }
866 ADVANCE_BATCH();
867 }
868 }
869
870 void
871 brw_load_register_mem(struct brw_context *brw,
872 uint32_t reg,
873 struct brw_bo *bo,
874 uint32_t offset)
875 {
876 load_sized_register_mem(brw, reg, bo, offset, 1);
877 }
878
879 void
880 brw_load_register_mem64(struct brw_context *brw,
881 uint32_t reg,
882 struct brw_bo *bo,
883 uint32_t offset)
884 {
885 load_sized_register_mem(brw, reg, bo, offset, 2);
886 }
887
888 /*
889 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
890 */
891 void
892 brw_store_register_mem32(struct brw_context *brw,
893 struct brw_bo *bo, uint32_t reg, uint32_t offset)
894 {
895 const struct gen_device_info *devinfo = &brw->screen->devinfo;
896
897 assert(devinfo->gen >= 6);
898
899 if (devinfo->gen >= 8) {
900 BEGIN_BATCH(4);
901 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
902 OUT_BATCH(reg);
903 OUT_RELOC64(bo, RELOC_WRITE, offset);
904 ADVANCE_BATCH();
905 } else {
906 BEGIN_BATCH(3);
907 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
908 OUT_BATCH(reg);
909 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
910 ADVANCE_BATCH();
911 }
912 }
913
914 /*
915 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
916 */
917 void
918 brw_store_register_mem64(struct brw_context *brw,
919 struct brw_bo *bo, uint32_t reg, uint32_t offset)
920 {
921 const struct gen_device_info *devinfo = &brw->screen->devinfo;
922
923 assert(devinfo->gen >= 6);
924
925 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
926 * read a full 64-bit register, we need to do two of them.
927 */
928 if (devinfo->gen >= 8) {
929 BEGIN_BATCH(8);
930 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
931 OUT_BATCH(reg);
932 OUT_RELOC64(bo, RELOC_WRITE, offset);
933 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
934 OUT_BATCH(reg + sizeof(uint32_t));
935 OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
936 ADVANCE_BATCH();
937 } else {
938 BEGIN_BATCH(6);
939 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
940 OUT_BATCH(reg);
941 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
942 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
943 OUT_BATCH(reg + sizeof(uint32_t));
944 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
945 ADVANCE_BATCH();
946 }
947 }
948
949 /*
950 * Write a 32-bit register using immediate data.
951 */
952 void
953 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
954 {
955 const struct gen_device_info *devinfo = &brw->screen->devinfo;
956
957 assert(devinfo->gen >= 6);
958
959 BEGIN_BATCH(3);
960 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
961 OUT_BATCH(reg);
962 OUT_BATCH(imm);
963 ADVANCE_BATCH();
964 }
965
966 /*
967 * Write a 64-bit register using immediate data.
968 */
969 void
970 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
971 {
972 const struct gen_device_info *devinfo = &brw->screen->devinfo;
973
974 assert(devinfo->gen >= 6);
975
976 BEGIN_BATCH(5);
977 OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
978 OUT_BATCH(reg);
979 OUT_BATCH(imm & 0xffffffff);
980 OUT_BATCH(reg + 4);
981 OUT_BATCH(imm >> 32);
982 ADVANCE_BATCH();
983 }
984
985 /*
986 * Copies a 32-bit register.
987 */
988 void
989 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
990 {
991 const struct gen_device_info *devinfo = &brw->screen->devinfo;
992
993 assert(devinfo->gen >= 8 || devinfo->is_haswell);
994
995 BEGIN_BATCH(3);
996 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
997 OUT_BATCH(src);
998 OUT_BATCH(dest);
999 ADVANCE_BATCH();
1000 }
1001
1002 /*
1003 * Copies a 64-bit register.
1004 */
1005 void
1006 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1007 {
1008 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1009
1010 assert(devinfo->gen >= 8 || devinfo->is_haswell);
1011
1012 BEGIN_BATCH(6);
1013 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1014 OUT_BATCH(src);
1015 OUT_BATCH(dest);
1016 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1017 OUT_BATCH(src + sizeof(uint32_t));
1018 OUT_BATCH(dest + sizeof(uint32_t));
1019 ADVANCE_BATCH();
1020 }
1021
1022 /*
1023 * Write 32-bits of immediate data to a GPU memory buffer.
1024 */
1025 void
1026 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1027 uint32_t offset, uint32_t imm)
1028 {
1029 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1030
1031 assert(devinfo->gen >= 6);
1032
1033 BEGIN_BATCH(4);
1034 OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1035 if (devinfo->gen >= 8)
1036 OUT_RELOC64(bo, RELOC_WRITE, offset);
1037 else {
1038 OUT_BATCH(0); /* MBZ */
1039 OUT_RELOC(bo, RELOC_WRITE, offset);
1040 }
1041 OUT_BATCH(imm);
1042 ADVANCE_BATCH();
1043 }
1044
1045 /*
1046 * Write 64-bits of immediate data to a GPU memory buffer.
1047 */
1048 void
1049 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1050 uint32_t offset, uint64_t imm)
1051 {
1052 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1053
1054 assert(devinfo->gen >= 6);
1055
1056 BEGIN_BATCH(5);
1057 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1058 if (devinfo->gen >= 8)
1059 OUT_RELOC64(bo, RELOC_WRITE, offset); /* this is a write, as in the pre-Gen8 path */
1060 else {
1061 OUT_BATCH(0); /* MBZ */
1062 OUT_RELOC(bo, RELOC_WRITE, offset);
1063 }
1064 OUT_BATCH(imm & 0xffffffffu);
1065 OUT_BATCH(imm >> 32);
1066 ADVANCE_BATCH();
1067 }