clover: grid_offset should be padded with 0, not 1
[mesa.git] / src / gallium / state_trackers / clover / core / kernel.cpp
1 //
2 // Copyright 2012 Francisco Jerez
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
21 //
22
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28
29 using namespace clover;
30
31 kernel::kernel(clover::program &prog, const std::string &name,
32 const std::vector<module::argument> &margs) :
33 program(prog), _name(name), exec(*this),
34 program_ref(prog._kernel_ref_counter) {
35 for (auto &marg : margs) {
36 if (marg.semantic == module::argument::general)
37 _args.emplace_back(argument::create(marg));
38 }
39 }
40
41 template<typename V>
42 static inline std::vector<uint>
43 pad_vector(command_queue &q, const V &v, uint x) {
44 std::vector<uint> w { v.begin(), v.end() };
45 w.resize(q.device().max_block_size().size(), x);
46 return w;
47 }
48
///
/// Launch this kernel on queue \a q over the given grid.  \a grid_size is
/// in work-items and is divided by \a block_size before being handed to
/// the pipe driver.
///
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().binary(q.device());
   // Grid size in blocks rather than work-items, as launch_grid() expects.
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need make
   // sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
      return (uint32_t *)&exec.input[h];
   }, exec.g_handles);

   // Bind the compute state and every object the launch references.
   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.  Unused trailing
   // dimensions are padded with 1 so they don't scale the grid away.
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   // Entry point of this kernel within the program binary.
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

   // Unbind everything again after the launch.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);

   // Make later commands see the kernel's global buffer writes.
   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
   exec.unbind();
}
96
97 size_t
98 kernel::mem_local() const {
99 size_t sz = 0;
100
101 for (auto &arg : args()) {
102 if (dynamic_cast<local_argument *>(&arg))
103 sz += arg.storage();
104 }
105
106 return sz;
107 }
108
size_t
kernel::mem_private() const {
   // Private memory usage isn't tracked here -- report none.
   return 0;
}

const std::string &
kernel::name() const {
   return _name;
}
118
119 std::vector<size_t>
120 kernel::optimal_block_size(const command_queue &q,
121 const std::vector<size_t> &grid_size) const {
122 return factor::find_grid_optimal_factor<size_t>(
123 q.device().max_threads_per_block(), q.device().max_block_size(),
124 grid_size);
125 }
126
std::vector<size_t>
kernel::required_block_size() const {
   // No required work-group size is tracked; all zeros means "none".
   return { 0, 0, 0 };
}

kernel::argument_range
kernel::args() {
   // View of the explicit arguments with the unique_ptr wrappers peeled off.
   return map(derefs(), _args);
}

kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}

const module &
kernel::module(const command_queue &q) const {
   // Pick the binary built for the device this queue runs on.
   return program().binary(q.device());
}
146
kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}

kernel::exec_context::~exec_context() {
   // Release the pipe compute state if one was ever created.
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}
155
///
/// Serialize all kernel arguments into the input buffer and (re)create
/// the pipe compute state if anything relevant changed.  Returns the
/// compute state handle to use for the launch.
///
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   // Take a reference to the new queue; _q holds the previous queue
   // afterwards so a queue change can be detected below.
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.program().binary(q->device());
   auto margs = find(name_equals(kern.name()), m.syms).args;
   auto msec = find(type_equals(module::section::text), m.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &marg : margs) {
      switch (marg.semantic) {
      case module::argument::general:
         // User-provided argument: consume the next one set via the API.
         (*(explicit_arg++))->bind(*this, marg);
         break;

      case module::argument::grid_dimension: {
         // Implicit argument: number of grid dimensions.
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(marg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, marg);
         break;
      }
      case module::argument::grid_offset: {
         // Implicit argument: one component per dimension, padded with
         // zeros up to the device's dimension count.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_size: {
         // Implicit argument: dimensions of the image bound by the
         // preceding explicit argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
            static_cast<cl_uint>(img->width()),
            static_cast<cl_uint>(img->height()),
            static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_format: {
         // Implicit argument: channel data type and order of the image
         // bound by the preceding explicit argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
            static_cast<cl_uint>(fmt.image_channel_data_type),
            static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         // Delete the old state on the previous queue (_q after the
         // swap above).
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.ir_type = q->device().ir_format();
      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
   }

   return st;
}
237
238 void
239 kernel::exec_context::unbind() {
240 for (auto &arg : kern.args())
241 arg.unbind(*this);
242
243 input.clear();
244 samplers.clear();
245 sviews.clear();
246 resources.clear();
247 g_buffers.clear();
248 g_handles.clear();
249 mem_local = 0;
250 }
251
namespace {
   ///
   /// Return the raw bytes of object \a x in native byte order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

   ///
   /// Return the most significant bit of buffer \a s interpreted as an
   /// integer in native byte order.
   ///
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      // Keep the least significant bytes, which sit at opposite ends of
      // the buffer depending on the native endianness.
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n value-initialized elements to the end of buffer \a v,
   /// returning the offset they start at.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}
327
328 std::unique_ptr<kernel::argument>
329 kernel::argument::create(const module::argument &marg) {
330 switch (marg.type) {
331 case module::argument::scalar:
332 return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
333
334 case module::argument::global:
335 return std::unique_ptr<kernel::argument>(new global_argument);
336
337 case module::argument::local:
338 return std::unique_ptr<kernel::argument>(new local_argument);
339
340 case module::argument::constant:
341 return std::unique_ptr<kernel::argument>(new constant_argument);
342
343 case module::argument::image2d_rd:
344 case module::argument::image3d_rd:
345 return std::unique_ptr<kernel::argument>(new image_rd_argument);
346
347 case module::argument::image2d_wr:
348 case module::argument::image3d_wr:
349 return std::unique_ptr<kernel::argument>(new image_wr_argument);
350
351 case module::argument::sampler:
352 return std::unique_ptr<kernel::argument>(new sampler_argument);
353
354 }
355 throw error(CL_INVALID_KERNEL_DEFINITION);
356 }
357
kernel::argument::argument() : _set(false) {
}

///
/// Whether a value has been provided for this argument.
///
bool
kernel::argument::set() const {
   return _set;
}

///
/// Local memory storage required by this argument (none by default).
///
size_t
kernel::argument::storage() const {
   return 0;
}

kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}
373
374 void
375 kernel::scalar_argument::set(size_t size, const void *value) {
376 if (!value)
377 throw error(CL_INVALID_ARG_VALUE);
378
379 if (size != this->size)
380 throw error(CL_INVALID_ARG_SIZE);
381
382 v = { (uint8_t *)value, (uint8_t *)value + size };
383 _set = true;
384 }
385
void
kernel::scalar_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   // Work on a copy so the stored value can be reused by later launches.
   auto w = v;

   extend(w, marg.ext_type, marg.target_size);
   byteswap(w, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, w);
}

void
kernel::scalar_argument::unbind(exec_context &ctx) {
   // Scalars hold no per-launch resources.
}
400
401 void
402 kernel::global_argument::set(size_t size, const void *value) {
403 if (size != sizeof(cl_mem))
404 throw error(CL_INVALID_ARG_SIZE);
405
406 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
407 _set = true;
408 }
409
void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      const resource &r = buf->resource(*ctx.q);
      // Record where in the input buffer the pointer value lives;
      // launch() hands these offsets to set_global_binding() as handles.
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-dimensional offsets?
      // We don't need to. Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::global_argument::unbind(exec_context &ctx) {
   // Nothing to release here.
}

size_t
kernel::local_argument::storage() const {
   return _storage;
}
441
442 void
443 kernel::local_argument::set(size_t size, const void *value) {
444 if (value)
445 throw error(CL_INVALID_ARG_VALUE);
446
447 if (!size)
448 throw error(CL_INVALID_ARG_SIZE);
449
450 _storage = size;
451 _set = true;
452 }
453
void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
   // Pass the argument's offset into local memory as its pointer value,
   // then reserve its storage by bumping the running total.
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}

void
kernel::local_argument::unbind(exec_context &ctx) {
   // Nothing to release here.
}
470
471 void
472 kernel::constant_argument::set(size_t size, const void *value) {
473 if (size != sizeof(cl_mem))
474 throw error(CL_INVALID_ARG_SIZE);
475
476 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
477 _set = true;
478 }
479
void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      resource &r = buf->resource(*ctx.q);
      // Encode the resource index in the top byte of the pointer value
      // and the byte offset into the resource in the rest.
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, module::argument::zero_ext, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      // rw = false -- presumably read-only (cf. image_wr, which passes
      // true); TODO confirm against resource::bind_surface().
      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::constant_argument::unbind(exec_context &ctx) {
   // Only release the surface if one was bound (non-null buffer).
   if (buf)
      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}
506
507 void
508 kernel::image_rd_argument::set(size_t size, const void *value) {
509 if (!value)
510 throw error(CL_INVALID_ARG_VALUE);
511
512 if (size != sizeof(cl_mem))
513 throw error(CL_INVALID_ARG_SIZE);
514
515 img = &obj<image>(*(cl_mem *)value);
516 _set = true;
517 }
518
void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   // The kernel receives the index of the image's sampler view.
   auto v = bytes(ctx.sviews.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}

void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}
537
538 void
539 kernel::image_wr_argument::set(size_t size, const void *value) {
540 if (!value)
541 throw error(CL_INVALID_ARG_VALUE);
542
543 if (size != sizeof(cl_mem))
544 throw error(CL_INVALID_ARG_SIZE);
545
546 img = &obj<image>(*(cl_mem *)value);
547 _set = true;
548 }
549
void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   // The kernel receives the index of the image's surface in the
   // compute resource list.
   auto v = bytes(ctx.resources.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   // rw = true: the surface is bound for writing.
   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
   ctx.resources.push_back(st);
}

void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}
568
569 void
570 kernel::sampler_argument::set(size_t size, const void *value) {
571 if (!value)
572 throw error(CL_INVALID_SAMPLER);
573
574 if (size != sizeof(cl_sampler))
575 throw error(CL_INVALID_ARG_SIZE);
576
577 s = &obj(*(cl_sampler *)value);
578 _set = true;
579 }
580
void
kernel::sampler_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
   // Samplers are bound as pipe sampler states rather than serialized
   // into the input buffer.
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}

void
kernel::sampler_argument::unbind(exec_context &ctx) {
   s->unbind(*ctx.q, st);
}