clover: Calculate optimal work group size when it's not specified by the user.
mesa.git: src/gallium/state_trackers/clover/core/kernel.cpp
//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//

#include "core/kernel.hpp"
#include "core/resource.hpp"
#include "util/factor.hpp"
#include "util/u_math.h"
#include "pipe/p_context.h"

using namespace clover;

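// Instantiate one argument object of the appropriate type for each
// argument declared in the module's metadata.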
kernel::kernel(program &prog, const std::string &name,
               const std::vector<module::argument> &margs) :
   prog(prog), _name(name), exec(*this) {
   for (auto &marg : margs) {
      if (marg.type == module::argument::scalar)
         _args.emplace_back(new scalar_argument(marg.size));
      else if (marg.type == module::argument::global)
         _args.emplace_back(new global_argument);
      else if (marg.type == module::argument::local)
         _args.emplace_back(new local_argument);
      else if (marg.type == module::argument::constant)
         _args.emplace_back(new constant_argument);
      else if (marg.type == module::argument::image2d_rd ||
               marg.type == module::argument::image3d_rd)
         _args.emplace_back(new image_rd_argument);
      else if (marg.type == module::argument::image2d_wr ||
               marg.type == module::argument::image3d_wr)
         _args.emplace_back(new image_wr_argument);
      else if (marg.type == module::argument::sampler)
         _args.emplace_back(new sampler_argument);
      else
         throw error(CL_INVALID_KERNEL_DEFINITION);
   }
}

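// Widen \a v to one entry per grid dimension supported by the device,
// filling any missing entries with \a x.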
template<typename V>
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   std::vector<uint> w { v.begin(), v.end() };
   w.resize(q.dev.max_block_size().size(), x);
   return w;
}

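// Bind the kernel arguments and the required device state, kick off
// the grid, then unbind everything again so the queue is left in a
// clean state.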
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = prog.binary(q.dev);
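   // launch_grid() expects the grid size in blocks rather than
   // threads, so divide each dimension by the block size.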
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q);

   // The handles are created during exec_context::bind(), so we need
   // to make sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   q.pipe->launch_grid(q.pipe,
                       pad_vector(q, block_size, 1).data(),
                       pad_vector(q, reduced_grid_size, 1).data(),
                       find(name_equals(_name), m.syms).offset,
                       exec.input.data());

   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);
   exec.unbind();
}

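// Total amount of local memory consumed by the kernel's __local
// arguments that have been set so far.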
size_t
kernel::mem_local() const {
   size_t sz = 0;

   for (auto &arg : args()) {
      if (dynamic_cast<local_argument *>(&arg))
         sz += arg.storage();
   }

   return sz;
}

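// Amount of private memory used by each work-item.  Not tracked by
// clover at this point, so report zero.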
size_t
kernel::mem_private() const {
   return 0;
}

const std::string &
kernel::name() const {
   return _name;
}

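// Calculate a reasonable block size for the given grid when the user
// didn't specify one: find_grid_optimal_factor() is expected to pick
// divisors of each grid dimension that keep the total number of
// threads per block within the device limits.  For illustration
// (assuming the helper maximizes the number of threads per block):
// with max_threads_per_block = 256 and grid_size = { 800, 600 }, a
// block size such as { 32, 8 } would be a valid result, since 32
// divides 800, 8 divides 600 and 32 * 8 <= 256.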
std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   return factor::find_grid_optimal_factor<size_t>(
      q.dev.max_threads_per_block(), q.dev.max_block_size(),
      grid_size);
}

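// Block size required by the kernel source, as e.g. a
// reqd_work_group_size attribute would specify.  All zeros means that
// no particular block size is required.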
std::vector<size_t>
kernel::required_block_size() const {
   return { 0, 0, 0 };
}

kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}

kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}

const module &
kernel::module(const command_queue &q) const {
   return prog.binary(q.dev);
}

kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}

kernel::exec_context::~exec_context() {
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}

void *
kernel::exec_context::bind(command_queue *_q) {
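   // After the swap, q refers to the queue we're binding to, while _q
   // holds the queue the kernel was previously bound to, if any.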
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.prog.binary(q->dev);
   auto margs = find(name_equals(kern.name()), m.syms).args;
   auto msec = find(type_equals(module::section::text), m.secs);

   for_each([=](kernel::argument &karg, const module::argument &marg) {
         karg.bind(*this, marg);
      }, kern.args(), margs);

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.prog = msec.data.begin();
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
   }

   return st;
}

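// Release the per-launch state acquired by the argument objects and
// reset the input buffer.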
void
kernel::exec_context::unbind() {
   for (auto &arg : kern.args())
      arg.unbind(*this);

   input.clear();
   samplers.clear();
   sviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}

namespace {
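   ///
   /// Return a buffer containing the raw bytes of \a x.
   ///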
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

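   ///
   /// Return true if the most significant bit of buffer \a s is set.
   ///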
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v, returning the
   /// position of the first appended element.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}

kernel::argument::argument() : _set(false) {
}

bool
kernel::argument::set() const {
   return _set;
}

size_t
kernel::argument::storage() const {
   return 0;
}

kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}

void
kernel::scalar_argument::set(size_t size, const void *value) {
   if (size != this->size)
      throw error(CL_INVALID_ARG_SIZE);

   v = { (uint8_t *)value, (uint8_t *)value + size };
   _set = true;
}

void
kernel::scalar_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
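   // Adjust the value to the target ABI: widen it to the expected
   // size, byteswap it if the device endianness differs from the
   // host's, and append it to the input buffer with the required
   // alignment.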
   auto w = v;

   extend(w, marg.ext_type, marg.target_size);
   byteswap(w, ctx.q->dev.endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, w);
}

void
kernel::scalar_argument::unbind(exec_context &ctx) {
}

void
kernel::global_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = &obj<buffer>(*(cl_mem *)value);
   _set = true;
}

void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
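   // We don't know the address of the buffer yet, so just reserve room
   // for the handle in the input buffer and remember its location;
   // launch() hands that location to the driver through
   // set_global_binding() so the actual address can be patched in.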
   align(ctx.input, marg.target_align);
   ctx.g_handles.push_back(allocate(ctx.input, marg.target_size));
   ctx.g_buffers.push_back(buf->resource(*ctx.q).pipe);
}

void
kernel::global_argument::unbind(exec_context &ctx) {
}

size_t
kernel::local_argument::storage() const {
   return _storage;
}

void
kernel::local_argument::set(size_t size, const void *value) {
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   _storage = size;
   _set = true;
}

void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
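   // The value the kernel sees is the current offset into the block of
   // shared local memory, so the sub-allocations of different __local
   // arguments don't overlap.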
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->dev.endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}

void
kernel::local_argument::unbind(exec_context &ctx) {
}

void
kernel::constant_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = &obj<buffer>(*(cl_mem *)value);
   _set = true;
}

void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
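   // The value the kernel sees encodes the index of the constant
   // buffer in its most significant byte, presumably leaving the
   // remaining bits available as an offset into the buffer.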
   auto v = bytes(ctx.resources.size() << 24);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->dev.endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = buf->resource(*ctx.q).bind_surface(*ctx.q, false);
   ctx.resources.push_back(st);
}

void
kernel::constant_argument::unbind(exec_context &ctx) {
   buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

void
kernel::image_rd_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
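   // Read-only images are bound as sampler views; the value the kernel
   // sees is the index of the view in the sampler view array.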
   auto v = bytes(ctx.sviews.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->dev.endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}

void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}

void
kernel::image_wr_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
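   // Writable images are bound as compute resources; the value the
   // kernel sees is the index of the surface in the resource array.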
   auto v = bytes(ctx.resources.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->dev.endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
   ctx.resources.push_back(st);
}

void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

void
kernel::sampler_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_sampler))
      throw error(CL_INVALID_ARG_SIZE);

   s = &obj(*(cl_sampler *)value);
   _set = true;
}

void
kernel::sampler_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}

void
kernel::sampler_argument::unbind(exec_context &ctx) {
   s->unbind(*ctx.q, st);
}