clover: remove util/compat
[mesa.git] / src / gallium / state_trackers / clover / core / kernel.cpp
1 //
2 // Copyright 2012 Francisco Jerez
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
21 //
22
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28
29 using namespace clover;
30
kernel::kernel(clover::program &prog, const std::string &name,
               const std::vector<module::argument> &margs) :
   program(prog), _name(name), exec(*this),
   program_ref(prog._kernel_ref_counter) {
   // Only arguments with "general" semantics are user-settable kernel
   // arguments; implicit arguments (grid dimension, grid offset, ...)
   // are created on the fly in exec_context::bind().
   for (auto &marg : margs) {
      if (marg.semantic == module::argument::general)
         _args.emplace_back(argument::create(marg));
   }
}
40
41 template<typename V>
42 static inline std::vector<uint>
43 pad_vector(command_queue &q, const V &v, uint x) {
44 std::vector<uint> w { v.begin(), v.end() };
45 w.resize(q.device().max_block_size().size(), x);
46 return w;
47 }
48
///
/// Set up the compute pipeline state and launch this kernel on queue
/// \a q with the given grid offset, grid size and block size (all in
/// units of work items).
///
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().binary(q.device());
   // launch_grid() takes the grid size in blocks rather than work
   // items, so divide by the block size.
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);

   // The handles are created during exec_context::bind(), so we need to
   // make sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
      return (uint32_t *)&exec.input[h];
   }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   // The driver relocates the global buffers and patches their handles
   // into the input block through g_handles.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Pad unused dimensions with a block size and grid size of one.
   q.pipe->launch_grid(q.pipe,
                       pad_vector(q, block_size, 1).data(),
                       pad_vector(q, reduced_grid_size, 1).data(),
                       find(name_equals(_name), m.syms).offset,
                       exec.input.data());

   // Unbind everything again in reverse order.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);
   exec.unbind();
}
91
92 size_t
93 kernel::mem_local() const {
94 size_t sz = 0;
95
96 for (auto &arg : args()) {
97 if (dynamic_cast<local_argument *>(&arg))
98 sz += arg.storage();
99 }
100
101 return sz;
102 }
103
///
/// Amount of private memory used by this kernel.  Private memory
/// accounting isn't implemented, so this always reports zero.
///
size_t
kernel::mem_private() const {
   return 0;
}
108
///
/// Name of the kernel function as it appears in the module symbols.
///
const std::string &
kernel::name() const {
   return _name;
}
113
114 std::vector<size_t>
115 kernel::optimal_block_size(const command_queue &q,
116 const std::vector<size_t> &grid_size) const {
117 return factor::find_grid_optimal_factor<size_t>(
118 q.device().max_threads_per_block(), q.device().max_block_size(),
119 grid_size);
120 }
121
122 std::vector<size_t>
123 kernel::required_block_size() const {
124 return { 0, 0, 0 };
125 }
126
///
/// Mutable view over the kernel's explicit arguments.
///
kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}
131
///
/// Read-only view over the kernel's explicit arguments.
///
kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}
136
137 const module &
138 kernel::module(const command_queue &q) const {
139 return program().binary(q.device());
140 }
141
// Start with no bound queue, no accumulated local memory and no
// cached compute state.
kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}
145
146 kernel::exec_context::~exec_context() {
147 if (st)
148 q->pipe->delete_compute_state(q->pipe, st);
149 }
150
151 void *
152 kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
153 const std::vector<size_t> &grid_offset) {
154 std::swap(q, _q);
155
156 // Bind kernel arguments.
157 auto &m = kern.program().binary(q->device());
158 auto margs = find(name_equals(kern.name()), m.syms).args;
159 auto msec = find(type_equals(module::section::text), m.secs);
160 auto explicit_arg = kern._args.begin();
161
162 for (auto &marg : margs) {
163 switch (marg.semantic) {
164 case module::argument::general:
165 (*(explicit_arg++))->bind(*this, marg);
166 break;
167
168 case module::argument::grid_dimension: {
169 const cl_uint dimension = grid_offset.size();
170 auto arg = argument::create(marg);
171
172 arg->set(sizeof(dimension), &dimension);
173 arg->bind(*this, marg);
174 break;
175 }
176 case module::argument::grid_offset: {
177 for (cl_uint x : pad_vector(*q, grid_offset, 1)) {
178 auto arg = argument::create(marg);
179
180 arg->set(sizeof(x), &x);
181 arg->bind(*this, marg);
182 }
183 break;
184 }
185 }
186 }
187
188 // Create a new compute state if anything changed.
189 if (!st || q != _q ||
190 cs.req_local_mem != mem_local ||
191 cs.req_input_mem != input.size()) {
192 if (st)
193 _q->pipe->delete_compute_state(_q->pipe, st);
194
195 cs.prog = &(msec.data[0]);
196 cs.req_local_mem = mem_local;
197 cs.req_input_mem = input.size();
198 st = q->pipe->create_compute_state(q->pipe, &cs);
199 }
200
201 return st;
202 }
203
204 void
205 kernel::exec_context::unbind() {
206 for (auto &arg : kern.args())
207 arg.unbind(*this);
208
209 input.clear();
210 samplers.clear();
211 sviews.clear();
212 resources.clear();
213 g_buffers.clear();
214 g_handles.clear();
215 mem_local = 0;
216 }
217
218 namespace {
219 template<typename T>
220 std::vector<uint8_t>
221 bytes(const T& x) {
222 return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
223 }
224
225 ///
226 /// Transform buffer \a v from the native byte order into the byte
227 /// order specified by \a e.
228 ///
229 template<typename T>
230 void
231 byteswap(T &v, pipe_endian e) {
232 if (PIPE_ENDIAN_NATIVE != e)
233 std::reverse(v.begin(), v.end());
234 }
235
236 ///
237 /// Pad buffer \a v to the next multiple of \a n.
238 ///
239 template<typename T>
240 void
241 align(T &v, size_t n) {
242 v.resize(util_align_npot(v.size(), n));
243 }
244
245 bool
246 msb(const std::vector<uint8_t> &s) {
247 if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
248 return s.back() & 0x80;
249 else
250 return s.front() & 0x80;
251 }
252
253 ///
254 /// Resize buffer \a v to size \a n using sign or zero extension
255 /// according to \a ext.
256 ///
257 template<typename T>
258 void
259 extend(T &v, enum module::argument::ext_type ext, size_t n) {
260 const size_t m = std::min(v.size(), n);
261 const bool sign_ext = (ext == module::argument::sign_ext);
262 const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
263 T w(n, fill);
264
265 if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
266 std::copy_n(v.begin(), m, w.begin());
267 else
268 std::copy_n(v.end() - m, m, w.end() - m);
269
270 std::swap(v, w);
271 }
272
273 ///
274 /// Append buffer \a w to \a v.
275 ///
276 template<typename T>
277 void
278 insert(T &v, const T &w) {
279 v.insert(v.end(), w.begin(), w.end());
280 }
281
282 ///
283 /// Append \a n elements to the end of buffer \a v.
284 ///
285 template<typename T>
286 size_t
287 allocate(T &v, size_t n) {
288 size_t pos = v.size();
289 v.resize(pos + n);
290 return pos;
291 }
292 }
293
294 std::unique_ptr<kernel::argument>
295 kernel::argument::create(const module::argument &marg) {
296 switch (marg.type) {
297 case module::argument::scalar:
298 return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
299
300 case module::argument::global:
301 return std::unique_ptr<kernel::argument>(new global_argument);
302
303 case module::argument::local:
304 return std::unique_ptr<kernel::argument>(new local_argument);
305
306 case module::argument::constant:
307 return std::unique_ptr<kernel::argument>(new constant_argument);
308
309 case module::argument::image2d_rd:
310 case module::argument::image3d_rd:
311 return std::unique_ptr<kernel::argument>(new image_rd_argument);
312
313 case module::argument::image2d_wr:
314 case module::argument::image3d_wr:
315 return std::unique_ptr<kernel::argument>(new image_wr_argument);
316
317 case module::argument::sampler:
318 return std::unique_ptr<kernel::argument>(new sampler_argument);
319
320 }
321 throw error(CL_INVALID_KERNEL_DEFINITION);
322 }
323
// Arguments start out unset; set() must be called before a launch.
kernel::argument::argument() : _set(false) {
}
326
///
/// Whether a value has been provided for this argument.
///
bool
kernel::argument::set() const {
   return _set;
}
331
///
/// Bytes of local memory this argument requires.  Zero for all
/// argument kinds except local_argument, which overrides this.
///
size_t
kernel::argument::storage() const {
   return 0;
}
336
// `size` is the expected byte size of the scalar, validated in set().
kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}
339
340 void
341 kernel::scalar_argument::set(size_t size, const void *value) {
342 if (size != this->size)
343 throw error(CL_INVALID_ARG_SIZE);
344
345 v = { (uint8_t *)value, (uint8_t *)value + size };
346 _set = true;
347 }
348
349 void
350 kernel::scalar_argument::bind(exec_context &ctx,
351 const module::argument &marg) {
352 auto w = v;
353
354 extend(w, marg.ext_type, marg.target_size);
355 byteswap(w, ctx.q->device().endianness());
356 align(ctx.input, marg.target_align);
357 insert(ctx.input, w);
358 }
359
// Scalars hold no per-launch state, so there is nothing to release.
void
kernel::scalar_argument::unbind(exec_context &ctx) {
}
363
364 void
365 kernel::global_argument::set(size_t size, const void *value) {
366 if (size != sizeof(cl_mem))
367 throw error(CL_INVALID_ARG_SIZE);
368
369 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
370 _set = true;
371 }
372
373 void
374 kernel::global_argument::bind(exec_context &ctx,
375 const module::argument &marg) {
376 align(ctx.input, marg.target_align);
377
378 if (buf) {
379 const resource &r = buf->resource(*ctx.q);
380 ctx.g_handles.push_back(ctx.input.size());
381 ctx.g_buffers.push_back(r.pipe);
382
383 // How to handle multi-demensional offsets?
384 // We don't need to. Buffer offsets are always
385 // one-dimensional.
386 auto v = bytes(r.offset[0]);
387 extend(v, marg.ext_type, marg.target_size);
388 byteswap(v, ctx.q->device().endianness());
389 insert(ctx.input, v);
390 } else {
391 // Null pointer.
392 allocate(ctx.input, marg.target_size);
393 }
394 }
395
// Global buffers are unbound collectively in kernel::launch() via
// set_global_binding(); nothing to do per argument.
void
kernel::global_argument::unbind(exec_context &ctx) {
}
399
///
/// Bytes of local memory requested for this __local argument.
///
size_t
kernel::local_argument::storage() const {
   return _storage;
}
404
405 void
406 kernel::local_argument::set(size_t size, const void *value) {
407 if (value)
408 throw error(CL_INVALID_ARG_VALUE);
409
410 _storage = size;
411 _set = true;
412 }
413
414 void
415 kernel::local_argument::bind(exec_context &ctx,
416 const module::argument &marg) {
417 auto v = bytes(ctx.mem_local);
418
419 extend(v, module::argument::zero_ext, marg.target_size);
420 byteswap(v, ctx.q->device().endianness());
421 align(ctx.input, marg.target_align);
422 insert(ctx.input, v);
423
424 ctx.mem_local += _storage;
425 }
426
// The accumulated local memory counter is reset wholesale in
// exec_context::unbind(); nothing to do per argument.
void
kernel::local_argument::unbind(exec_context &ctx) {
}
430
431 void
432 kernel::constant_argument::set(size_t size, const void *value) {
433 if (size != sizeof(cl_mem))
434 throw error(CL_INVALID_ARG_SIZE);
435
436 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
437 _set = true;
438 }
439
440 void
441 kernel::constant_argument::bind(exec_context &ctx,
442 const module::argument &marg) {
443 align(ctx.input, marg.target_align);
444
445 if (buf) {
446 resource &r = buf->resource(*ctx.q);
447 auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);
448
449 extend(v, module::argument::zero_ext, marg.target_size);
450 byteswap(v, ctx.q->device().endianness());
451 insert(ctx.input, v);
452
453 st = r.bind_surface(*ctx.q, false);
454 ctx.resources.push_back(st);
455 } else {
456 // Null pointer.
457 allocate(ctx.input, marg.target_size);
458 }
459 }
460
461 void
462 kernel::constant_argument::unbind(exec_context &ctx) {
463 if (buf)
464 buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
465 }
466
467 void
468 kernel::image_rd_argument::set(size_t size, const void *value) {
469 if (size != sizeof(cl_mem))
470 throw error(CL_INVALID_ARG_SIZE);
471
472 img = &obj<image>(*(cl_mem *)value);
473 _set = true;
474 }
475
476 void
477 kernel::image_rd_argument::bind(exec_context &ctx,
478 const module::argument &marg) {
479 auto v = bytes(ctx.sviews.size());
480
481 extend(v, module::argument::zero_ext, marg.target_size);
482 byteswap(v, ctx.q->device().endianness());
483 align(ctx.input, marg.target_align);
484 insert(ctx.input, v);
485
486 st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
487 ctx.sviews.push_back(st);
488 }
489
490 void
491 kernel::image_rd_argument::unbind(exec_context &ctx) {
492 img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
493 }
494
495 void
496 kernel::image_wr_argument::set(size_t size, const void *value) {
497 if (size != sizeof(cl_mem))
498 throw error(CL_INVALID_ARG_SIZE);
499
500 img = &obj<image>(*(cl_mem *)value);
501 _set = true;
502 }
503
504 void
505 kernel::image_wr_argument::bind(exec_context &ctx,
506 const module::argument &marg) {
507 auto v = bytes(ctx.resources.size());
508
509 extend(v, module::argument::zero_ext, marg.target_size);
510 byteswap(v, ctx.q->device().endianness());
511 align(ctx.input, marg.target_align);
512 insert(ctx.input, v);
513
514 st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
515 ctx.resources.push_back(st);
516 }
517
518 void
519 kernel::image_wr_argument::unbind(exec_context &ctx) {
520 img->resource(*ctx.q).unbind_surface(*ctx.q, st);
521 }
522
523 void
524 kernel::sampler_argument::set(size_t size, const void *value) {
525 if (size != sizeof(cl_sampler))
526 throw error(CL_INVALID_ARG_SIZE);
527
528 s = &obj(*(cl_sampler *)value);
529 _set = true;
530 }
531
532 void
533 kernel::sampler_argument::bind(exec_context &ctx,
534 const module::argument &marg) {
535 st = s->bind(*ctx.q);
536 ctx.samplers.push_back(st);
537 }
538
539 void
540 kernel::sampler_argument::unbind(exec_context &ctx) {
541 s->unbind(*ctx.q, st);
542 }