// Git commit 428229c050390672e0b86826a41a2c96c5b42700
// [mesa.git] src/gallium/state_trackers/clover/core/kernel.cpp
//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28
29 using namespace clover;
30
31 kernel::kernel(clover::program &prog, const std::string &name,
32 const std::vector<module::argument> &margs) :
33 program(prog), _name(name), exec(*this),
34 program_ref(prog._kernel_ref_counter) {
35 for (auto &marg : margs) {
36 if (marg.semantic == module::argument::general)
37 _args.emplace_back(argument::create(marg));
38 }
39 }
40
41 template<typename V>
42 static inline std::vector<uint>
43 pad_vector(command_queue &q, const V &v, uint x) {
44 std::vector<uint> w { v.begin(), v.end() };
45 w.resize(q.device().max_block_size().size(), x);
46 return w;
47 }
48
///
/// Execute this kernel on queue \a q with the given grid geometry
/// (all sizes are in threads, per the OpenCL NDRange convention).
///
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().build(q.device()).binary;
   // launch_grid() takes the grid in blocks rather than in threads.
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need make
   // sure to call exec_context::bind() before retrieving them.
   std::vector<uint64_t *> g_handles = map([&](size_t h) {
         return (uint64_t *)&exec.input[h];
      }, exec.g_handles);

   // Bind the compute state and every object the kernel arguments
   // refer to (samplers, sampler views, resources, global buffers).
   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.
   info.work_dim = grid_size.size();
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

   // Unbind everything again, in reverse order of the binds above.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);

   // Make global buffer writes visible to subsequent operations.
   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
   exec.unbind();
}
97
98 size_t
99 kernel::mem_local() const {
100 size_t sz = 0;
101
102 for (auto &arg : args()) {
103 if (dynamic_cast<local_argument *>(&arg))
104 sz += arg.storage();
105 }
106
107 return sz;
108 }
109
size_t
kernel::mem_private() const {
   // Private memory usage isn't tracked per-kernel here yet.
   return 0;
}
114
///
/// Name of the kernel function, as looked up in the module symbol table.
///
const std::string &
kernel::name() const {
   return _name;
}
119
std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   // Factor the grid into a block size that respects both the
   // per-dimension and the total thread-count limits of the device.
   return factor::find_grid_optimal_factor<size_t>(
      q.device().max_threads_per_block(), q.device().max_block_size(),
      grid_size);
}
127
128 std::vector<size_t>
129 kernel::required_block_size() const {
130 return { 0, 0, 0 };
131 }
132
kernel::argument_range
kernel::args() {
   // Expose the stored argument pointers as a range of references.
   return map(derefs(), _args);
}
137
kernel::const_argument_range
kernel::args() const {
   // Const overload of the above.
   return map(derefs(), _args);
}
142
const module &
kernel::module(const command_queue &q) const {
   // The binary depends on which device the program was built for.
   return program().build(q.device()).binary;
}
147
// Execution state starts out empty; it's filled in by bind().
kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}
151
kernel::exec_context::~exec_context() {
   // Release the cached compute state on the queue it was created for.
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}
156
///
/// Serialize all kernel arguments into the input buffer, bind the
/// objects they refer to, and return a compute state object ready to
/// be passed to bind_compute_state().  Throws CL_OUT_OF_RESOURCES on
/// compute state creation failure.
///
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   // After the swap, q holds the new queue and _q the previous one.
   // _q is used below both to detect a queue change and to free the
   // old compute state on the queue it was created for.
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.program().build(q->device()).binary;
   auto msym = find(name_equals(kern.name()), m.syms);
   auto margs = msym.args;
   auto msec = find(id_equals(msym.section), m.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &marg : margs) {
      switch (marg.semantic) {
      case module::argument::general:
         // Explicit argument whose value was provided by the user.
         (*(explicit_arg++))->bind(*this, marg);
         break;

      case module::argument::grid_dimension: {
         // Implicit argument: number of grid dimensions.
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(marg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, marg);
         break;
      }
      case module::argument::grid_offset: {
         // Implicit argument: one grid offset component per
         // device-supported dimension, zero-padded.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_size: {
         // Implicit argument: dimensions of the explicit image
         // argument bound immediately before this one.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
               static_cast<cl_uint>(img->width()),
               static_cast<cl_uint>(img->height()),
               static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_format: {
         // Implicit argument: channel data type and order of the
         // explicit image argument bound immediately before this one.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
               static_cast<cl_uint>(fmt.image_channel_data_type),
               static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.ir_type = q->device().ir_format();
      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
      if (!st) {
         unbind(); // Cleanup
         throw error(CL_OUT_OF_RESOURCES);
      }
   }

   return st;
}
243
244 void
245 kernel::exec_context::unbind() {
246 for (auto &arg : kern.args())
247 arg.unbind(*this);
248
249 input.clear();
250 samplers.clear();
251 sviews.clear();
252 resources.clear();
253 g_buffers.clear();
254 g_handles.clear();
255 mem_local = 0;
256 }
257
namespace {
   ///
   /// Serialize object \a x into a buffer of its raw bytes in host
   /// byte order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

   ///
   /// Return true if the most significant bit of buffer \a s is set,
   /// taking the host byte order into account.
   ///
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      // Fill byte is all-ones only when sign-extending a negative value.
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      // Keep the value bytes at the numerically least significant end
      // of the buffer for either host endianness.
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v and return the
   /// offset they start at.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}
333
334 std::unique_ptr<kernel::argument>
335 kernel::argument::create(const module::argument &marg) {
336 switch (marg.type) {
337 case module::argument::scalar:
338 return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
339
340 case module::argument::global:
341 return std::unique_ptr<kernel::argument>(new global_argument);
342
343 case module::argument::local:
344 return std::unique_ptr<kernel::argument>(new local_argument);
345
346 case module::argument::constant:
347 return std::unique_ptr<kernel::argument>(new constant_argument);
348
349 case module::argument::image2d_rd:
350 case module::argument::image3d_rd:
351 return std::unique_ptr<kernel::argument>(new image_rd_argument);
352
353 case module::argument::image2d_wr:
354 case module::argument::image3d_wr:
355 return std::unique_ptr<kernel::argument>(new image_wr_argument);
356
357 case module::argument::sampler:
358 return std::unique_ptr<kernel::argument>(new sampler_argument);
359
360 }
361 throw error(CL_INVALID_KERNEL_DEFINITION);
362 }
363
// Arguments start out unset; set() flips _set once a value is provided.
kernel::argument::argument() : _set(false) {
}
366
bool
kernel::argument::set() const {
   // Whether a value has been provided via set().
   return _set;
}
371
size_t
kernel::argument::storage() const {
   // Most argument kinds consume no local storage; local_argument
   // overrides this.
   return 0;
}
376
// \a size is the expected size of the scalar value in bytes.
kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}
379
380 void
381 kernel::scalar_argument::set(size_t size, const void *value) {
382 if (!value)
383 throw error(CL_INVALID_ARG_VALUE);
384
385 if (size != this->size)
386 throw error(CL_INVALID_ARG_SIZE);
387
388 v = { (uint8_t *)value, (uint8_t *)value + size };
389 _set = true;
390 }
391
void
kernel::scalar_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   // Work on a copy so the stored value can be re-bound later.
   auto w = v;

   // Extend to the target size, fix byte order, then append to the
   // input buffer at the required alignment.
   extend(w, marg.ext_type, marg.target_size);
   byteswap(w, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, w);
}
402
void
kernel::scalar_argument::unbind(exec_context &ctx) {
   // Nothing to release: the value was copied into ctx.input.
}
406
407 void
408 kernel::global_argument::set(size_t size, const void *value) {
409 if (size != sizeof(cl_mem))
410 throw error(CL_INVALID_ARG_SIZE);
411
412 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
413 _set = true;
414 }
415
void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      const resource &r = buf->resource(*ctx.q);
      // Record where in the input buffer the address goes so that
      // launch() can hand the location to the driver for relocation
      // (see the g_handles handling there).
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-demensional offsets?
      // We don't need to. Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}
438
void
kernel::global_argument::unbind(exec_context &ctx) {
   // Nothing to do: launch() unbinds the global buffers itself.
}
442
size_t
kernel::local_argument::storage() const {
   // Amount of local memory requested via set() for this argument.
   return _storage;
}
447
void
kernel::local_argument::set(size_t size, const void *value) {
   // __local arguments are sized but carry no data: the value must be
   // NULL and the size nonzero.
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   if (!size)
      throw error(CL_INVALID_ARG_SIZE);

   _storage = size;
   _set = true;
}
459
void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
   // Pass the current local memory offset as the argument value and
   // reserve _storage bytes of local memory starting there.
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}
472
void
kernel::local_argument::unbind(exec_context &ctx) {
   // Nothing to release; ctx.mem_local is reset by exec_context::unbind().
}
476
477 void
478 kernel::constant_argument::set(size_t size, const void *value) {
479 if (size != sizeof(cl_mem))
480 throw error(CL_INVALID_ARG_SIZE);
481
482 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
483 _set = true;
484 }
485
void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      resource &r = buf->resource(*ctx.q);
      // Pack the resource index into the top byte of the argument
      // value and the offset within the resource into the low 24 bits.
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, module::argument::zero_ext, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      // Bind the buffer as a read-only compute resource.
      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}
506
void
kernel::constant_argument::unbind(exec_context &ctx) {
   // Release the surface acquired in bind(), if any.
   if (buf)
      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}
512
513 void
514 kernel::image_rd_argument::set(size_t size, const void *value) {
515 if (!value)
516 throw error(CL_INVALID_ARG_VALUE);
517
518 if (size != sizeof(cl_mem))
519 throw error(CL_INVALID_ARG_SIZE);
520
521 img = &obj<image>(*(cl_mem *)value);
522 _set = true;
523 }
524
void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   // Pass the index of the sampler view (bound below) as the
   // argument value.
   auto v = bytes(ctx.sviews.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}
538
void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   // Release the sampler view acquired in bind().
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}
543
544 void
545 kernel::image_wr_argument::set(size_t size, const void *value) {
546 if (!value)
547 throw error(CL_INVALID_ARG_VALUE);
548
549 if (size != sizeof(cl_mem))
550 throw error(CL_INVALID_ARG_SIZE);
551
552 img = &obj<image>(*(cl_mem *)value);
553 _set = true;
554 }
555
void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   // Pass the index of the compute resource (bound below) as the
   // argument value.
   auto v = bytes(ctx.resources.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   // Bind the image as a writable surface.
   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
   ctx.resources.push_back(st);
}
569
void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   // Release the surface acquired in bind().
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}
574
575 void
576 kernel::sampler_argument::set(size_t size, const void *value) {
577 if (!value)
578 throw error(CL_INVALID_SAMPLER);
579
580 if (size != sizeof(cl_sampler))
581 throw error(CL_INVALID_ARG_SIZE);
582
583 s = &obj(*(cl_sampler *)value);
584 _set = true;
585 }
586
void
kernel::sampler_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
   // Samplers are bound as pipe sampler states rather than serialized
   // into the input buffer.
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}
593
void
kernel::sampler_argument::unbind(exec_context &ctx) {
   // Release the sampler state acquired in bind().
   s->unbind(*ctx.q, st);
}