winsys/radeon: add usage parameter to cs_is_buffer_referenced
[mesa.git] / src / gallium / winsys / radeon / drm / radeon_drm_cs.c
1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27 /*
28 * Authors:
29 * Marek Olšák <maraeo@gmail.com>
30 *
31 * Based on work from libdrm_radeon by:
32 * Aapo Tahkola <aet@rasterburn.org>
33 * Nicolai Haehnle <prefect_@gmx.net>
34 * Jérôme Glisse <glisse@freedesktop.org>
35 */
36
37 /*
38    This file replaces libdrm's radeon_cs_gem with our own implementation.
39 It's optimized specifically for Radeon DRM.
40 Reloc writes and space checking are faster and simpler than their
41 counterparts in libdrm (the time complexity of all the functions
42 is O(1) in nearly all scenarios, thanks to hashing).
43
44 It works like this:
45
46 cs_add_reloc(cs, buf, read_domain, write_domain) adds a new relocation and
47 also adds the size of 'buf' to the used_gart and used_vram winsys variables
48 based on the domains, which are simply or'd for the accounting purposes.
49 The adding is skipped if the reloc is already present in the list, but it
50 accounts any newly-referenced domains.
51
52 cs_validate is then called, which just checks:
53 used_vram/gart < vram/gart_size * 0.8
54 The 0.8 number allows for some memory fragmentation. If the validation
55    fails, the pipe driver flushes CS and tries to do the validation again,
56 i.e. it validates only that one operation. If it fails again, it drops
57 the operation on the floor and prints some nasty message to stderr.
58 (done in the pipe driver)
59
60 cs_write_reloc(cs, buf) just writes a reloc that has been added using
61 cs_add_reloc. The read_domain and write_domain parameters have been removed,
62 because we already specify them in cs_add_reloc.
63 */
64
65 #include "radeon_drm_cs.h"
66
67 #include "util/u_memory.h"
68
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <stdint.h>
72 #include <xf86drm.h>
73
74 /*
75  * These definitions are copied from radeon_drm.h. Once an updated libdrm
76  * is released, we should bump the configure.ac requirement for it and
77  * remove the following fields.
78  */
79 #ifndef RADEON_CHUNK_ID_FLAGS
80 #define RADEON_CHUNK_ID_FLAGS 0x03
81
82 /* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */
83 #define RADEON_CS_KEEP_TILING_FLAGS 0x01
84 #endif
85
86 #ifndef RADEON_CS_USE_VM
87 #define RADEON_CS_USE_VM 0x02
88 /* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */
89 #define RADEON_CS_RING_GFX 0
90 #define RADEON_CS_RING_COMPUTE 1
91 #endif
92
93
94 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
95
96 static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
97 struct radeon_drm_winsys *ws)
98 {
99 csc->fd = ws->fd;
100 csc->nrelocs = 512;
101 csc->relocs_bo = (struct radeon_bo**)
102 CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
103 if (!csc->relocs_bo) {
104 return FALSE;
105 }
106
107 csc->relocs = (struct drm_radeon_cs_reloc*)
108 CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
109 if (!csc->relocs) {
110 FREE(csc->relocs_bo);
111 return FALSE;
112 }
113
114 csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
115 csc->chunks[0].length_dw = 0;
116 csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
117 csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
118 csc->chunks[1].length_dw = 0;
119 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
120 csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
121 csc->chunks[2].length_dw = 1;
122 csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
123
124 csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
125 csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
126 csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
127
128 csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
129 return TRUE;
130 }
131
132 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
133 {
134 unsigned i;
135
136 for (i = 0; i < csc->crelocs; i++) {
137 p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
138 radeon_bo_reference(&csc->relocs_bo[i], NULL);
139 }
140
141 csc->crelocs = 0;
142 csc->validated_crelocs = 0;
143 csc->chunks[0].length_dw = 0;
144 csc->chunks[1].length_dw = 0;
145 csc->used_gart = 0;
146 csc->used_vram = 0;
147 memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
148 }
149
/* Free a CS context: release all buffer references via cleanup, then
 * free the relocation arrays themselves. */
static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}
156
/* RADEON_THREAD=0 in the environment disables the asynchronous flush thread. */
DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
/* Forward declaration of the flush-thread entry point (defined below). */
static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
159
160 static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
161 {
162 struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
163 struct radeon_drm_cs *cs;
164
165 cs = CALLOC_STRUCT(radeon_drm_cs);
166 if (!cs) {
167 return NULL;
168 }
169 pipe_semaphore_init(&cs->flush_queued, 0);
170 pipe_semaphore_init(&cs->flush_completed, 0);
171
172 cs->ws = ws;
173
174 if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
175 FREE(cs);
176 return NULL;
177 }
178 if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
179 radeon_destroy_cs_context(&cs->csc1);
180 FREE(cs);
181 return NULL;
182 }
183
184 /* Set the first command buffer as current. */
185 cs->csc = &cs->csc1;
186 cs->cst = &cs->csc2;
187 cs->base.buf = cs->csc->buf;
188
189 p_atomic_inc(&ws->num_cs);
190 if (cs->ws->num_cpus > 1 && debug_get_option_thread())
191 cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
192 return &cs->base;
193 }
194
195 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
196
197 static INLINE void update_reloc_domains(struct drm_radeon_cs_reloc *reloc,
198 enum radeon_bo_domain rd,
199 enum radeon_bo_domain wd,
200 enum radeon_bo_domain *added_domains)
201 {
202 *added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
203
204 reloc->read_domains |= rd;
205 reloc->write_domain |= wd;
206 }
207
208 int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
209 {
210 struct drm_radeon_cs_reloc *reloc;
211 unsigned i;
212 unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
213
214 if (csc->is_handle_added[hash]) {
215 reloc = csc->relocs_hashlist[hash];
216 if (reloc->handle == bo->handle) {
217 return csc->reloc_indices_hashlist[hash];
218 }
219
220 /* Hash collision, look for the BO in the list of relocs linearly. */
221 for (i = csc->crelocs; i != 0;) {
222 --i;
223 reloc = &csc->relocs[i];
224 if (reloc->handle == bo->handle) {
225 /* Put this reloc in the hash list.
226 * This will prevent additional hash collisions if there are
227 * several consecutive get_reloc calls for the same buffer.
228 *
229 * Example: Assuming buffers A,B,C collide in the hash list,
230 * the following sequence of relocs:
231 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
232 * will collide here: ^ and here: ^,
233 * meaning that we should get very few collisions in the end. */
234 csc->relocs_hashlist[hash] = reloc;
235 csc->reloc_indices_hashlist[hash] = i;
236 /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
237 return i;
238 }
239 }
240 }
241
242 return -1;
243 }
244
/* Add a relocation for 'bo' to the CS, or merge new domains into an
 * existing one.
 *
 * usage          - RADEON_USAGE_READ/WRITE bits; each selected direction
 *                  gets 'domains' as its read/write domain set.
 * added_domains  - returns the domains newly referenced by this call
 *                  (used by the caller for used_gart/used_vram accounting).
 *
 * Returns the index of the relocation in csc->relocs.
 */
static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
                                 struct radeon_bo *bo,
                                 enum radeon_bo_usage usage,
                                 enum radeon_bo_domain domains,
                                 enum radeon_bo_domain *added_domains)
{
    struct drm_radeon_cs_reloc *reloc;
    unsigned i;
    /* Direct-mapped hash keyed on the BO handle; relies on the table size
     * being a power of two for the mask to work. */
    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;

    if (csc->is_handle_added[hash]) {
        /* Fast path: the hash slot holds exactly this buffer. */
        reloc = csc->relocs_hashlist[hash];
        if (reloc->handle == bo->handle) {
            update_reloc_domains(reloc, rd, wd, added_domains);
            return csc->reloc_indices_hashlist[hash];
        }

        /* Hash collision, look for the BO in the list of relocs linearly. */
        for (i = csc->crelocs; i != 0;) {
            --i;
            reloc = &csc->relocs[i];
            if (reloc->handle == bo->handle) {
                update_reloc_domains(reloc, rd, wd, added_domains);

                /* Re-seat the hash slot so consecutive calls for the same
                 * buffer don't pay for the collision again. */
                csc->relocs_hashlist[hash] = reloc;
                csc->reloc_indices_hashlist[hash] = i;
                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                return i;
            }
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->crelocs >= csc->nrelocs) {
        uint32_t size;
        csc->nrelocs += 10;

        /* NOTE(review): both realloc results are unchecked — on OOM the
         * NULL return is dereferenced below and the old arrays leak.
         * Also, if realloc moves csc->relocs, any pointers still stored
         * in relocs_hashlist for other hash slots become stale — verify
         * whether lookups can hit them before the next cleanup. */
        size = csc->nrelocs * sizeof(struct radeon_bo*);
        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);

        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);

        /* The reloc array may have moved; refresh the chunk pointer the
         * kernel reads during the CS ioctl. */
        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->crelocs] = NULL;
    radeon_bo_reference(&csc->relocs_bo[csc->crelocs], bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->crelocs];
    reloc->handle = bo->handle;
    reloc->read_domains = rd;
    reloc->write_domain = wd;
    reloc->flags = 0;

    csc->is_handle_added[hash] = TRUE;
    csc->relocs_hashlist[hash] = reloc;
    csc->reloc_indices_hashlist[hash] = csc->crelocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    *added_domains = rd | wd;
    return csc->crelocs++;
}
312
313 static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
314 struct radeon_winsys_cs_handle *buf,
315 enum radeon_bo_usage usage,
316 enum radeon_bo_domain domains)
317 {
318 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
319 struct radeon_bo *bo = (struct radeon_bo*)buf;
320 enum radeon_bo_domain added_domains;
321
322 unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);
323
324 if (added_domains & RADEON_DOMAIN_GTT)
325 cs->csc->used_gart += bo->base.size;
326 if (added_domains & RADEON_DOMAIN_VRAM)
327 cs->csc->used_vram += bo->base.size;
328
329 return index;
330 }
331
332 static boolean radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
333 {
334 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
335 boolean status =
336 cs->csc->used_gart < cs->ws->info.gart_size * 0.8 &&
337 cs->csc->used_vram < cs->ws->info.vram_size * 0.8;
338
339 if (status) {
340 cs->csc->validated_crelocs = cs->csc->crelocs;
341 } else {
342 /* Remove lately-added relocations. The validation failed with them
343 * and the CS is about to be flushed because of that. Keep only
344 * the already-validated relocations. */
345 unsigned i;
346
347 for (i = cs->csc->validated_crelocs; i < cs->csc->crelocs; i++) {
348 p_atomic_dec(&cs->csc->relocs_bo[i]->num_cs_references);
349 radeon_bo_reference(&cs->csc->relocs_bo[i], NULL);
350 }
351 cs->csc->crelocs = cs->csc->validated_crelocs;
352
353 /* Flush if there are any relocs. Clean up otherwise. */
354 if (cs->csc->crelocs) {
355 cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC);
356 } else {
357 radeon_cs_context_cleanup(cs->csc);
358
359 assert(cs->base.cdw == 0);
360 if (cs->base.cdw != 0) {
361 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
362 }
363 }
364 }
365 return status;
366 }
367
368 static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
369 struct radeon_winsys_cs_handle *buf)
370 {
371 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
372 struct radeon_bo *bo = (struct radeon_bo*)buf;
373
374 unsigned index = radeon_get_reloc(cs->csc, bo);
375
376 if (index == -1) {
377 fprintf(stderr, "radeon: Cannot get a relocation in %s.\n", __func__);
378 return;
379 }
380
381 OUT_CS(&cs->base, 0xc0001000);
382 OUT_CS(&cs->base, index * RELOC_DWORDS);
383 }
384
385 static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
386 {
387 unsigned i;
388
389 if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
390 &csc->cs, sizeof(struct drm_radeon_cs))) {
391 if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
392 unsigned i;
393
394 fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
395 for (i = 0; i < csc->chunks[0].length_dw; i++) {
396 fprintf(stderr, "0x%08X\n", csc->buf[i]);
397 }
398 } else {
399 fprintf(stderr, "radeon: The kernel rejected CS, "
400 "see dmesg for more information.\n");
401 }
402 }
403
404 for (i = 0; i < csc->crelocs; i++)
405 p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
406
407 radeon_cs_context_cleanup(csc);
408 }
409
410 static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
411 {
412 struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;
413
414 while (1) {
415 pipe_semaphore_wait(&cs->flush_queued);
416 if (cs->kill_thread)
417 break;
418 radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
419 pipe_semaphore_signal(&cs->flush_completed);
420 }
421 pipe_semaphore_signal(&cs->flush_completed);
422 return NULL;
423 }
424
425 void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
426 {
427 /* Wait for any pending ioctl to complete. */
428 if (cs->thread && cs->flush_started) {
429 pipe_semaphore_wait(&cs->flush_completed);
430 cs->flush_started = 0;
431 }
432 }
433
/* Flush the command stream: hand the just-built CS context off for
 * submission and make the other context current for new commands.
 *
 * With a flush thread and RADEON_FLUSH_ASYNC, submission happens on the
 * worker thread; otherwise it is done synchronously here. The statement
 * order is load-bearing: sync before flipping contexts, and bump the
 * per-buffer active-ioctl counts before signaling the worker.
 */
static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    /* Wait for any previous asynchronous submission so its context can
     * be safely reused as the new build target. */
    radeon_drm_cs_sync_flush(cs);

    /* Flip command streams: csc is built into, cst is submitted. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty, emit it in a separate thread. */
    if (cs->base.cdw) {
        unsigned i, crelocs = cs->cst->crelocs;

        cs->cst->chunks[0].length_dw = cs->base.cdw;

        for (i = 0; i < crelocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
        }

        /* Submit only the IB and reloc chunks unless tiling flags or
         * virtual addressing require the third (flags) chunk. */
        cs->cst->flags = 0;
        cs->cst->cs.num_chunks = 2;
        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
            cs->cst->flags |= RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->cs.num_chunks = 3;
        }
        if (cs->ws->info.r600_virtual_address) {
            cs->cst->cs.num_chunks = 3;
            cs->cst->flags |= RADEON_CS_USE_VM;
        }

        if (cs->thread &&
            (flags & RADEON_FLUSH_ASYNC)) {
            /* Queue the submission on the worker thread. */
            cs->flush_started = 1;
            pipe_semaphore_signal(&cs->flush_queued);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
        }
    } else {
        /* Nothing to submit; just reset the context. */
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.buf = cs->csc->buf;
    cs->base.cdw = 0;
}
483
484 static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
485 {
486 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
487 radeon_drm_cs_sync_flush(cs);
488 if (cs->thread) {
489 cs->kill_thread = 1;
490 pipe_semaphore_signal(&cs->flush_queued);
491 pipe_semaphore_wait(&cs->flush_completed);
492 pipe_thread_wait(cs->thread);
493 pipe_thread_destroy(cs->thread);
494 }
495 pipe_semaphore_destroy(&cs->flush_queued);
496 pipe_semaphore_destroy(&cs->flush_completed);
497 radeon_cs_context_cleanup(&cs->csc1);
498 radeon_cs_context_cleanup(&cs->csc2);
499 p_atomic_dec(&cs->ws->num_cs);
500 radeon_destroy_cs_context(&cs->csc1);
501 radeon_destroy_cs_context(&cs->csc2);
502 FREE(cs);
503 }
504
505 static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
506 void (*flush)(void *ctx, unsigned flags),
507 void *user)
508 {
509 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
510 cs->flush_cs = flush;
511 cs->flush_data = user;
512 }
513
514 static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
515 struct radeon_winsys_cs_handle *_buf,
516 enum radeon_bo_usage usage)
517 {
518 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
519 struct radeon_bo *bo = (struct radeon_bo*)_buf;
520 int index;
521
522 if (!bo->num_cs_references)
523 return FALSE;
524
525 index = radeon_get_reloc(cs->csc, bo);
526 if (index == -1)
527 return FALSE;
528
529 if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
530 return TRUE;
531 if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
532 return TRUE;
533
534 return FALSE;
535 }
536
/* Plug the CS management entry points into the winsys vtable. */
void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_reloc = radeon_drm_cs_add_reloc;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}