//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//

#include "core/kernel.hpp"
#include "core/resource.hpp"
#include "util/factor.hpp"
#include "util/u_math.h"
#include "pipe/p_context.h"

using namespace clover;

kernel::kernel(clover::program &prog, const std::string &name,
               const std::vector<module::argument> &margs) :
   program(prog), _name(name), exec(*this),
   program_ref(prog._kernel_ref_counter) {
   for (auto &marg : margs) {
      if (marg.semantic == module::argument::general)
         _args.emplace_back(argument::create(marg));
   }
}

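// Extend vector v to the dimensionality of the device's grid, padding
// any missing components with x.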
template<typename V>
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   std::vector<uint> w { v.begin(), v.end() };
   w.resize(q.device().max_block_size().size(), x);
   return w;
}

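// Launch the kernel on the given command queue: bind the arguments
// and compute state, fill in the grid description and dispatch it to
// the pipe driver.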
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().binary(q.device());
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info;

   // The handles are created during exec_context::bind(), so we need to
   // make sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

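   // Unbind the state and resources bound above.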
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);
   exec.unbind();
}

size_t
kernel::mem_local() const {
   size_t sz = 0;

   for (auto &arg : args()) {
      if (dynamic_cast<local_argument *>(&arg))
         sz += arg.storage();
   }

   return sz;
}

size_t
kernel::mem_private() const {
   return 0;
}

const std::string &
kernel::name() const {
   return _name;
}

std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   return factor::find_grid_optimal_factor<size_t>(
      q.device().max_threads_per_block(), q.device().max_block_size(),
      grid_size);
}

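// An all-zero vector means the kernel doesn't require any particular
// block size.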
std::vector<size_t>
kernel::required_block_size() const {
   return { 0, 0, 0 };
}

kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}

kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}

const module &
kernel::module(const command_queue &q) const {
   return program().binary(q.device());
}

kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}

kernel::exec_context::~exec_context() {
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}

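// Set up the context for execution on command queue _q: marshal the
// kernel arguments into the input buffer and (re)create the compute
// state if anything relevant changed.  Returns the compute state.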
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.program().binary(q->device());
   auto margs = find(name_equals(kern.name()), m.syms).args;
   auto msec = find(type_equals(module::section::text), m.secs);
   auto explicit_arg = kern._args.begin();

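   // Bind the explicit arguments in order and synthesize the implicit
   // ones from the launch parameters.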
   for (auto &marg : margs) {
      switch (marg.semantic) {
      case module::argument::general:
         (*(explicit_arg++))->bind(*this, marg);
         break;

      case module::argument::grid_dimension: {
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(marg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, marg);
         break;
      }
      case module::argument::grid_offset: {
         // Unused dimensions carry no offset, so pad with zero.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_size: {
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
            static_cast<cl_uint>(img->width()),
            static_cast<cl_uint>(img->height()),
            static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_format: {
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
            static_cast<cl_uint>(fmt.image_channel_data_type),
            static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
   }

   return st;
}

void
kernel::exec_context::unbind() {
   for (auto &arg : kern.args())
      arg.unbind(*this);

   input.clear();
   samplers.clear();
   sviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}

namespace {
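   ///
   /// Return the raw bytes of \a x in native byte order.
   ///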
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

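   ///
   /// Return true if the most significant bit of buffer \a s is set.
   ///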
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}

std::unique_ptr<kernel::argument>
kernel::argument::create(const module::argument &marg) {
   switch (marg.type) {
   case module::argument::scalar:
      return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));

   case module::argument::global:
      return std::unique_ptr<kernel::argument>(new global_argument);

   case module::argument::local:
      return std::unique_ptr<kernel::argument>(new local_argument);

   case module::argument::constant:
      return std::unique_ptr<kernel::argument>(new constant_argument);

   case module::argument::image2d_rd:
   case module::argument::image3d_rd:
      return std::unique_ptr<kernel::argument>(new image_rd_argument);

   case module::argument::image2d_wr:
   case module::argument::image3d_wr:
      return std::unique_ptr<kernel::argument>(new image_wr_argument);

   case module::argument::sampler:
      return std::unique_ptr<kernel::argument>(new sampler_argument);

   }
   throw error(CL_INVALID_KERNEL_DEFINITION);
}

kernel::argument::argument() : _set(false) {
}

bool
kernel::argument::set() const {
   return _set;
}

size_t
kernel::argument::storage() const {
   return 0;
}

kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}

void
kernel::scalar_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != this->size)
      throw error(CL_INVALID_ARG_SIZE);

   v = { (uint8_t *)value, (uint8_t *)value + size };
   _set = true;
}

void
kernel::scalar_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   auto w = v;

   extend(w, marg.ext_type, marg.target_size);
   byteswap(w, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, w);
}

void
kernel::scalar_argument::unbind(exec_context &ctx) {
}

void
kernel::global_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   _set = true;
}

void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      const resource &r = buf->resource(*ctx.q);
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-dimensional offsets?
      // We don't need to. Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::global_argument::unbind(exec_context &ctx) {
}

size_t
kernel::local_argument::storage() const {
   return _storage;
}

void
kernel::local_argument::set(size_t size, const void *value) {
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   if (!size)
      throw error(CL_INVALID_ARG_SIZE);

   _storage = size;
   _set = true;
}

void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
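   // Pass the current offset into the local memory area as the
   // argument value and reserve the storage the user requested for
   // this argument.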
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}

void
kernel::local_argument::unbind(exec_context &ctx) {
}

void
kernel::constant_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   _set = true;
}

void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      resource &r = buf->resource(*ctx.q);
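      // Pack the index the buffer will get in the resource list into
      // the top byte of the argument, with the offset into the buffer
      // in the lower bits.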
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, module::argument::zero_ext, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::constant_argument::unbind(exec_context &ctx) {
   if (buf)
      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

void
kernel::image_rd_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
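   // The argument value is the index of the sampler view bound below.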
   auto v = bytes(ctx.sviews.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}

void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}

void
kernel::image_wr_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
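   // The argument value is the index of the writable surface bound
   // below.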
   auto v = bytes(ctx.resources.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
   ctx.resources.push_back(st);
}

void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

void
kernel::sampler_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_SAMPLER);

   if (size != sizeof(cl_sampler))
      throw error(CL_INVALID_ARG_SIZE);

   s = &obj(*(cl_sampler *)value);
   _set = true;
}

void
kernel::sampler_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}

void
kernel::sampler_argument::unbind(exec_context &ctx) {
   s->unbind(*ctx.q, st);
}