gallium/util: Clarify comment in util_init_thread_pinning
[mesa.git] / src / gallium / auxiliary / util / u_helpers.c
/**************************************************************************
 *
 * Copyright 2012 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS AND/OR THEIR SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "util/u_cpu_detect.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_upload_mgr.h"
#include "util/u_thread.h"
#include "util/os_time.h"
#include <inttypes.h>

/**
 * This function is used to copy an array of pipe_vertex_buffer structures,
 * while properly referencing the pipe_vertex_buffer::buffer member.
 *
 * enabled_buffers is updated such that the bits corresponding to the indices
 * of disabled buffers are set to 0 and the enabled ones are set to 1.
 *
 * \sa util_copy_framebuffer_state
 */
void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst,
                                  uint32_t *enabled_buffers,
                                  const struct pipe_vertex_buffer *src,
                                  unsigned start_slot, unsigned count)
{
   unsigned i;
   uint32_t bitmask = 0;

   dst += start_slot;

   if (src) {
      for (i = 0; i < count; i++) {
         if (src[i].buffer.resource)
            bitmask |= 1 << i;

         pipe_vertex_buffer_unreference(&dst[i]);

         if (!src[i].is_user_buffer)
            pipe_resource_reference(&dst[i].buffer.resource, src[i].buffer.resource);
      }

      /* Copy over the other members of pipe_vertex_buffer. */
      memcpy(dst, src, count * sizeof(struct pipe_vertex_buffer));

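      /* Clear the bits for the updated slot range, then set the bits of the
       * slots that received a non-NULL buffer. */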
      *enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
      *enabled_buffers |= bitmask << start_slot;
   }
   else {
      /* Unreference the buffers. */
      for (i = 0; i < count; i++)
         pipe_vertex_buffer_unreference(&dst[i]);

      *enabled_buffers &= ~(((1ull << count) - 1) << start_slot);
   }
}

/**
 * Same as util_set_vertex_buffers_mask, but it updates \p dst_count (the
 * number of bound buffers, i.e. the last used slot + 1) instead of an
 * enabled-buffer bitmask.
 */
void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst,
                                   unsigned *dst_count,
                                   const struct pipe_vertex_buffer *src,
                                   unsigned start_slot, unsigned count)
{
   unsigned i;
   uint32_t enabled_buffers = 0;

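   /* Rebuild the enabled-buffer mask from the currently bound buffers, so
    * that util_set_vertex_buffers_mask can update it in place. */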
   for (i = 0; i < *dst_count; i++) {
      if (dst[i].buffer.resource)
         enabled_buffers |= (1ull << i);
   }

   util_set_vertex_buffers_mask(dst, &enabled_buffers, src, start_slot,
                                count);

   *dst_count = util_last_bit(enabled_buffers);
}

/**
 * Given a user index buffer, upload it and return the uploaded hardware
 * buffer and the offset of the data within it.
 */
bool
util_upload_index_buffer(struct pipe_context *pipe,
                         const struct pipe_draw_info *info,
                         struct pipe_resource **out_buffer,
                         unsigned *out_offset)
{
   unsigned start_offset = info->start * info->index_size;

   u_upload_data(pipe->stream_uploader, start_offset,
                 info->count * info->index_size, 4,
                 (char*)info->index.user + start_offset,
                 out_offset, out_buffer);
   u_upload_unmap(pipe->stream_uploader);
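   /* Callers keep using info->start with the uploaded buffer, so shift the
    * returned offset back by start_offset: start * index_size then lands
    * exactly at the beginning of the uploaded range. */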
   *out_offset -= start_offset;
   return *out_buffer != NULL;
}

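/* Illustrative sketch (not taken from any particular driver): a driver that
 * can't read user index buffers directly might use the helper like this,
 * where "indexbuf" and "offset" are locals of the draw function:
 *
 *    struct pipe_resource *indexbuf = NULL;
 *    unsigned offset;
 *
 *    if (info->index_size && info->has_user_indices) {
 *       if (!util_upload_index_buffer(pipe, info, &indexbuf, &offset))
 *          return;
 *       // draw from "indexbuf" at "offset" instead of the user pointer,
 *       // then unreference "indexbuf" when the draw is done
 *    }
 */
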
#ifdef HAVE_PTHREAD_SETAFFINITY

static unsigned L3_cache_number;
static once_flag thread_pinning_once_flag = ONCE_FLAG_INIT;

static void
util_set_full_cpu_affinity(void)
{
   cpu_set_t cpuset;

   CPU_ZERO(&cpuset);
   for (unsigned i = 0; i < CPU_SETSIZE; i++)
      CPU_SET(i, &cpuset);

   pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
}

static void
util_init_thread_pinning(void)
{
   /* Get a semi-random number. */
   int64_t t = os_time_get_nano();
   L3_cache_number = (t ^ (t >> 8) ^ (t >> 16));

   /* Reset thread affinity for all child processes to prevent them from
    * inheriting the current thread's affinity.
    *
    * XXX: If the driver is unloaded after this, and the app later calls
    * fork(), the child process will likely crash before fork() returns,
    * because the address where util_set_full_cpu_affinity was located
    * will either be unmapped or point to random other contents.
    */
   pthread_atfork(NULL, NULL, util_set_full_cpu_affinity);
}

#endif

/**
 * Called by MakeCurrent. Used to notify the driver that the application
 * thread may have changed.
 *
 * The function pins the current thread and driver threads to a group of
 * CPU cores that share the same L3 cache. This is needed for good multi-
 * threading performance on AMD Zen CPUs.
 *
 * \param upper_thread  thread in the state tracker that also needs to be
 *                      pinned.
 */
void
util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread)
{
#ifdef HAVE_PTHREAD_SETAFFINITY
   /* If pinning has no effect, don't do anything. */
   if (util_cpu_caps.nr_cpus == util_cpu_caps.cores_per_L3)
      return;

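   /* util_get_L3_for_pinned_thread returns the index of the L3 cache the
    * thread is already pinned to, or -1 if it isn't pinned to one. */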
   thrd_t current = thrd_current();
   int cache = util_get_L3_for_pinned_thread(current,
                                             util_cpu_caps.cores_per_L3);

   call_once(&thread_pinning_once_flag, util_init_thread_pinning);

   /* If the current thread is not pinned to any L3 cache yet, choose one. */
   if (cache == -1) {
      unsigned num_L3_caches = util_cpu_caps.nr_cpus /
                               util_cpu_caps.cores_per_L3;

      /* Choose a different L3 cache for each subsequent MakeCurrent. */
      cache = p_atomic_inc_return(&L3_cache_number) % num_L3_caches;
      util_pin_thread_to_L3(current, cache, util_cpu_caps.cores_per_L3);
   }

   /* Tell the driver to pin its threads to the same L3 cache. */
   if (ctx->set_context_param) {
      ctx->set_context_param(ctx, PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
                             cache);
   }

   /* Do the same for the upper level thread if there is any (e.g. glthread) */
   if (upper_thread)
      util_pin_thread_to_L3(*upper_thread, cache, util_cpu_caps.cores_per_L3);
#endif
}

/* This is a helper for hardware bring-up. Don't remove. */
struct pipe_query *
util_begin_pipestat_query(struct pipe_context *ctx)
{
   struct pipe_query *q =
      ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
   if (!q)
      return NULL;

   ctx->begin_query(ctx, q);
   return q;
}

/* This is a helper for hardware bring-up. Don't remove. */
void
util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q,
                        FILE *f)
{
   static unsigned counter;
   struct pipe_query_data_pipeline_statistics stats;

   ctx->end_query(ctx, q);
   ctx->get_query_result(ctx, q, true, (void*)&stats);
   ctx->destroy_query(ctx, q);

   fprintf(f,
           "Draw call %u:\n"
           "    ia_vertices    = %"PRIu64"\n"
           "    ia_primitives  = %"PRIu64"\n"
           "    vs_invocations = %"PRIu64"\n"
           "    gs_invocations = %"PRIu64"\n"
           "    gs_primitives  = %"PRIu64"\n"
           "    c_invocations  = %"PRIu64"\n"
           "    c_primitives   = %"PRIu64"\n"
           "    ps_invocations = %"PRIu64"\n"
           "    hs_invocations = %"PRIu64"\n"
           "    ds_invocations = %"PRIu64"\n"
           "    cs_invocations = %"PRIu64"\n",
           p_atomic_inc_return(&counter),
           stats.ia_vertices,
           stats.ia_primitives,
           stats.vs_invocations,
           stats.gs_invocations,
           stats.gs_primitives,
           stats.c_invocations,
           stats.c_primitives,
           stats.ps_invocations,
           stats.hs_invocations,
           stats.ds_invocations,
           stats.cs_invocations);
}

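/* Illustrative sketch: wrapping a single draw call during bring-up, where
 * "draw_info" and "dump_file" are hypothetical caller-owned variables:
 *
 *    struct pipe_query *q = util_begin_pipestat_query(pipe);
 *    pipe->draw_vbo(pipe, &draw_info);
 *    if (q)
 *       util_end_pipestat_query(pipe, q, dump_file);
 */
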
/* This is a helper for hardware bring-up. Don't remove. */
void
util_wait_for_idle(struct pipe_context *ctx)
{
   struct pipe_fence_handle *fence = NULL;

   ctx->flush(ctx, &fence, 0);
   ctx->screen->fence_finish(ctx->screen, NULL, fence, PIPE_TIMEOUT_INFINITE);
}

void
util_throttle_init(struct util_throttle *t, uint64_t max_mem_usage)
{
   t->max_mem_usage = max_mem_usage;
}

void
util_throttle_deinit(struct pipe_screen *screen, struct util_throttle *t)
{
   for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++)
      screen->fence_reference(screen, &t->ring[i].fence, NULL);
}

static uint64_t
util_get_throttle_total_memory_usage(struct util_throttle *t)
{
   uint64_t total_usage = 0;

   for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++)
      total_usage += t->ring[i].mem_usage;
   return total_usage;
}

static void util_dump_throttle_ring(struct util_throttle *t)
{
   printf("Throttle:\n");
   for (unsigned i = 0; i < ARRAY_SIZE(t->ring); i++) {
      printf("  ring[%u]: fence = %s, mem_usage = %"PRIu64"%s%s\n",
             i, t->ring[i].fence ? "yes" : " no",
             t->ring[i].mem_usage,
             t->flush_index == i ? " [flush]" : "",
             t->wait_index == i ? " [wait]" : "");
   }
}

/**
 * Notify util_throttle that the next operation allocates memory.
 * util_throttle tracks memory usage and waits for fences until its tracked
 * memory usage decreases.
 *
 * Example:
 *   util_throttle_memory_usage(..., w*h*d*Bpp);
 *   TexSubImage(..., w, h, d, ...);
 *
 * This means that TexSubImage can't allocate more memory than the maximum
 * limit set during initialization.
 */
void
util_throttle_memory_usage(struct pipe_context *pipe,
                           struct util_throttle *t, uint64_t memory_size)
{
   (void)util_dump_throttle_ring; /* silence warning */

   if (!t->max_mem_usage)
      return;

   struct pipe_screen *screen = pipe->screen;
   struct pipe_fence_handle **fence = NULL;
   unsigned ring_size = ARRAY_SIZE(t->ring);
   uint64_t total = util_get_throttle_total_memory_usage(t);

   /* If there is not enough memory, walk the list of fences and find
    * the latest one that we need to wait for.
    */
   while (t->wait_index != t->flush_index &&
          total && total + memory_size > t->max_mem_usage) {
      assert(t->ring[t->wait_index].fence);

      /* Release an older fence if we need to wait for a newer one. */
      if (fence)
         screen->fence_reference(screen, fence, NULL);

      fence = &t->ring[t->wait_index].fence;
      t->ring[t->wait_index].mem_usage = 0;
      t->wait_index = (t->wait_index + 1) % ring_size;

      total = util_get_throttle_total_memory_usage(t);
   }

   /* Wait for the fence to decrease memory usage. */
   if (fence) {
      screen->fence_finish(screen, pipe, *fence, PIPE_TIMEOUT_INFINITE);
      screen->fence_reference(screen, fence, NULL);
   }

   /* Flush and get a fence if we've exhausted memory usage for the current
    * slot.
    */
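   /* Each slot accumulates at most about max_mem_usage / (ring_size / 2)
    * bytes before it is flushed, so roughly half of the ring can be pending
    * before the wait loop above has to block. */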
   if (t->ring[t->flush_index].mem_usage &&
       t->ring[t->flush_index].mem_usage + memory_size >
       t->max_mem_usage / (ring_size / 2)) {
      struct pipe_fence_handle **fence =
         &t->ring[t->flush_index].fence;

      /* Expect that the current flush slot doesn't have a fence yet. */
      assert(!*fence);

      pipe->flush(pipe, fence, PIPE_FLUSH_ASYNC);
      t->flush_index = (t->flush_index + 1) % ring_size;

      /* Vacate the next slot if it's occupied. This should be rare. */
      if (t->flush_index == t->wait_index) {
         struct pipe_fence_handle **fence =
            &t->ring[t->wait_index].fence;

         t->ring[t->wait_index].mem_usage = 0;
         t->wait_index = (t->wait_index + 1) % ring_size;

         assert(*fence);
         screen->fence_finish(screen, pipe, *fence, PIPE_TIMEOUT_INFINITE);
         screen->fence_reference(screen, fence, NULL);
      }

      assert(!t->ring[t->flush_index].mem_usage);
      assert(!t->ring[t->flush_index].fence);
   }

   t->ring[t->flush_index].mem_usage += memory_size;
}