nv50,nvc0: add support for cull distances
[mesa.git] / src / gallium / drivers / nouveau / nv50 / nv50_program.c
1 /*
2 * Copyright 2010 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "nv50/nv50_program.h"
24 #include "nv50/nv50_context.h"
25
26 #include "codegen/nv50_ir_driver.h"
27
/* Return the number of set bits in the low four bits of val.
 * Bits above bit 3 are ignored.
 */
static inline unsigned
bitcount4(const uint32_t val)
{
   const uint32_t nibble = val & 0xf;

   return (nibble & 1) +
          ((nibble >> 1) & 1) +
          ((nibble >> 2) & 1) +
          (nibble >> 3);
}
35
36 static int
37 nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
38 {
39 struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
40 unsigned i, n, c;
41
42 n = 0;
43 for (i = 0; i < info->numInputs; ++i) {
44 prog->in[i].id = i;
45 prog->in[i].sn = info->in[i].sn;
46 prog->in[i].si = info->in[i].si;
47 prog->in[i].hw = n;
48 prog->in[i].mask = info->in[i].mask;
49
50 prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
51
52 for (c = 0; c < 4; ++c)
53 if (info->in[i].mask & (1 << c))
54 info->in[i].slot[c] = n++;
55
56 if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
57 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
58 }
59 prog->in_nr = info->numInputs;
60
61 for (i = 0; i < info->numSysVals; ++i) {
62 switch (info->sv[i].sn) {
63 case TGSI_SEMANTIC_INSTANCEID:
64 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
65 continue;
66 case TGSI_SEMANTIC_VERTEXID:
67 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
68 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
69 continue;
70 default:
71 break;
72 }
73 }
74
75 /*
76 * Corner case: VP has no inputs, but we will still need to submit data to
77 * draw it. HW will shout at us and won't draw anything if we don't enable
78 * any input, so let's just pretend it's the first one.
79 */
80 if (prog->vp.attrs[0] == 0 &&
81 prog->vp.attrs[1] == 0 &&
82 prog->vp.attrs[2] == 0)
83 prog->vp.attrs[0] |= 0xf;
84
85 /* VertexID before InstanceID */
86 if (info->io.vertexId < info->numSysVals)
87 info->sv[info->io.vertexId].slot[0] = n++;
88 if (info->io.instanceId < info->numSysVals)
89 info->sv[info->io.instanceId].slot[0] = n++;
90
91 n = 0;
92 for (i = 0; i < info->numOutputs; ++i) {
93 switch (info->out[i].sn) {
94 case TGSI_SEMANTIC_PSIZE:
95 prog->vp.psiz = i;
96 break;
97 case TGSI_SEMANTIC_CLIPDIST:
98 prog->vp.clpd[info->out[i].si] = n;
99 break;
100 case TGSI_SEMANTIC_EDGEFLAG:
101 prog->vp.edgeflag = i;
102 break;
103 case TGSI_SEMANTIC_BCOLOR:
104 prog->vp.bfc[info->out[i].si] = i;
105 break;
106 case TGSI_SEMANTIC_LAYER:
107 prog->gp.has_layer = true;
108 prog->gp.layerid = n;
109 break;
110 case TGSI_SEMANTIC_VIEWPORT_INDEX:
111 prog->gp.has_viewport = true;
112 prog->gp.viewportid = n;
113 break;
114 default:
115 break;
116 }
117 prog->out[i].id = i;
118 prog->out[i].sn = info->out[i].sn;
119 prog->out[i].si = info->out[i].si;
120 prog->out[i].hw = n;
121 prog->out[i].mask = info->out[i].mask;
122
123 for (c = 0; c < 4; ++c)
124 if (info->out[i].mask & (1 << c))
125 info->out[i].slot[c] = n++;
126 }
127 prog->out_nr = info->numOutputs;
128 prog->max_out = n;
129 if (!prog->max_out)
130 prog->max_out = 1;
131
132 if (prog->vp.psiz < info->numOutputs)
133 prog->vp.psiz = prog->out[prog->vp.psiz].hw;
134
135 return 0;
136 }
137
138 static int
139 nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
140 {
141 struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
142 unsigned i, n, m, c;
143 unsigned nvary;
144 unsigned nflat;
145 unsigned nintp = 0;
146
147 /* count recorded non-flat inputs */
148 for (m = 0, i = 0; i < info->numInputs; ++i) {
149 switch (info->in[i].sn) {
150 case TGSI_SEMANTIC_POSITION:
151 continue;
152 default:
153 m += info->in[i].flat ? 0 : 1;
154 break;
155 }
156 }
157 /* careful: id may be != i in info->in[prog->in[i].id] */
158
159 /* Fill prog->in[] so that non-flat inputs are first and
160 * kick out special inputs that don't use the RESULT_MAP.
161 */
162 for (n = 0, i = 0; i < info->numInputs; ++i) {
163 if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
164 prog->fp.interp |= info->in[i].mask << 24;
165 for (c = 0; c < 4; ++c)
166 if (info->in[i].mask & (1 << c))
167 info->in[i].slot[c] = nintp++;
168 } else {
169 unsigned j = info->in[i].flat ? m++ : n++;
170
171 if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
172 prog->vp.bfc[info->in[i].si] = j;
173 else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
174 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
175
176 prog->in[j].id = i;
177 prog->in[j].mask = info->in[i].mask;
178 prog->in[j].sn = info->in[i].sn;
179 prog->in[j].si = info->in[i].si;
180 prog->in[j].linear = info->in[i].linear;
181
182 prog->in_nr++;
183 }
184 }
185 if (!(prog->fp.interp & (8 << 24))) {
186 ++nintp;
187 prog->fp.interp |= 8 << 24;
188 }
189
190 for (i = 0; i < prog->in_nr; ++i) {
191 int j = prog->in[i].id;
192
193 prog->in[i].hw = nintp;
194 for (c = 0; c < 4; ++c)
195 if (prog->in[i].mask & (1 << c))
196 info->in[j].slot[c] = nintp++;
197 }
198 /* (n == m) if m never increased, i.e. no flat inputs */
199 nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
200 nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
201 nvary = nintp - nflat;
202
203 prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
204 prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
205
206 /* put front/back colors right after HPOS */
207 prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
208 for (i = 0; i < 2; ++i)
209 if (prog->vp.bfc[i] < 0xff)
210 prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
211
212 /* FP outputs */
213
214 if (info->prop.fp.numColourResults > 1)
215 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
216
217 for (i = 0; i < info->numOutputs; ++i) {
218 prog->out[i].id = i;
219 prog->out[i].sn = info->out[i].sn;
220 prog->out[i].si = info->out[i].si;
221 prog->out[i].mask = info->out[i].mask;
222
223 if (i == info->io.fragDepth || i == info->io.sampleMask)
224 continue;
225 prog->out[i].hw = info->out[i].si * 4;
226
227 for (c = 0; c < 4; ++c)
228 info->out[i].slot[c] = prog->out[i].hw + c;
229
230 prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
231 }
232
233 if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
234 info->out[info->io.sampleMask].slot[0] = prog->max_out++;
235 prog->fp.has_samplemask = 1;
236 }
237
238 if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
239 info->out[info->io.fragDepth].slot[2] = prog->max_out++;
240
241 if (!prog->max_out)
242 prog->max_out = 4;
243
244 return 0;
245 }
246
247 static int
248 nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
249 {
250 switch (info->type) {
251 case PIPE_SHADER_VERTEX:
252 return nv50_vertprog_assign_slots(info);
253 case PIPE_SHADER_GEOMETRY:
254 return nv50_vertprog_assign_slots(info);
255 case PIPE_SHADER_FRAGMENT:
256 return nv50_fragprog_assign_slots(info);
257 case PIPE_SHADER_COMPUTE:
258 return 0;
259 default:
260 return -1;
261 }
262 }
263
264 static struct nv50_stream_output_state *
265 nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
266 const struct pipe_stream_output_info *pso)
267 {
268 struct nv50_stream_output_state *so;
269 unsigned b, i, c;
270 unsigned base[4];
271
272 so = MALLOC_STRUCT(nv50_stream_output_state);
273 if (!so)
274 return NULL;
275 memset(so->map, 0xff, sizeof(so->map));
276
277 for (b = 0; b < 4; ++b)
278 so->num_attribs[b] = 0;
279 for (i = 0; i < pso->num_outputs; ++i) {
280 unsigned end = pso->output[i].dst_offset + pso->output[i].num_components;
281 b = pso->output[i].output_buffer;
282 assert(b < 4);
283 so->num_attribs[b] = MAX2(so->num_attribs[b], end);
284 }
285
286 so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
287
288 so->stride[0] = pso->stride[0] * 4;
289 base[0] = 0;
290 for (b = 1; b < 4; ++b) {
291 assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
292 so->stride[b] = so->num_attribs[b] * 4;
293 if (so->num_attribs[b])
294 so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
295 base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
296 }
297 if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
298 assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
299 so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
300 }
301
302 so->map_size = base[3] + so->num_attribs[3];
303
304 for (i = 0; i < pso->num_outputs; ++i) {
305 const unsigned s = pso->output[i].start_component;
306 const unsigned p = pso->output[i].dst_offset;
307 const unsigned r = pso->output[i].register_index;
308 b = pso->output[i].output_buffer;
309
310 for (c = 0; c < pso->output[i].num_components; ++c)
311 so->map[base[b] + p + c] = info->out[r].slot[s + c];
312 }
313
314 return so;
315 }
316
317 bool
318 nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
319 struct pipe_debug_callback *debug)
320 {
321 struct nv50_ir_prog_info *info;
322 int i, ret;
323 const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
324
325 info = CALLOC_STRUCT(nv50_ir_prog_info);
326 if (!info)
327 return false;
328
329 info->type = prog->type;
330 info->target = chipset;
331 info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
332 info->bin.source = (void *)prog->pipe.tokens;
333
334 info->io.auxCBSlot = 15;
335 info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
336 info->io.genUserClip = prog->vp.clpd_nr;
337
338 info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
339 info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
340 info->io.msInfoCBSlot = 15;
341 info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
342
343 info->assignSlots = nv50_program_assign_varying_slots;
344
345 prog->vp.bfc[0] = 0xff;
346 prog->vp.bfc[1] = 0xff;
347 prog->vp.edgeflag = 0xff;
348 prog->vp.clpd[0] = map_undef;
349 prog->vp.clpd[1] = map_undef;
350 prog->vp.psiz = map_undef;
351 prog->gp.has_layer = 0;
352 prog->gp.has_viewport = 0;
353
354 if (prog->type == PIPE_SHADER_COMPUTE)
355 info->prop.cp.inputOffset = 0x10;
356
357 info->driverPriv = prog;
358
359 #ifdef DEBUG
360 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
361 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
362 #else
363 info->optLevel = 3;
364 #endif
365
366 ret = nv50_ir_generate_code(info);
367 if (ret) {
368 NOUVEAU_ERR("shader translation failed: %i\n", ret);
369 goto out;
370 }
371
372 prog->code = info->bin.code;
373 prog->code_size = info->bin.codeSize;
374 prog->fixups = info->bin.relocData;
375 prog->interps = info->bin.fixupData;
376 prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
377 prog->tls_space = info->bin.tlsSpace;
378
379 prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
380
381 prog->vp.clip_enable = (1 << info->io.clipDistances) - 1;
382 prog->vp.cull_enable =
383 ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
384 prog->vp.clip_mode = 0;
385 for (i = 0; i < info->io.cullDistances; ++i)
386 prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
387
388 if (prog->type == PIPE_SHADER_FRAGMENT) {
389 if (info->prop.fp.writesDepth) {
390 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
391 prog->fp.flags[1] = 0x11;
392 }
393 if (info->prop.fp.usesDiscard)
394 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
395 } else
396 if (prog->type == PIPE_SHADER_GEOMETRY) {
397 switch (info->prop.gp.outputPrim) {
398 case PIPE_PRIM_LINE_STRIP:
399 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
400 break;
401 case PIPE_PRIM_TRIANGLE_STRIP:
402 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
403 break;
404 case PIPE_PRIM_POINTS:
405 default:
406 assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS);
407 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
408 break;
409 }
410 prog->gp.vert_count = info->prop.gp.maxVertices;
411 }
412
413 if (prog->type == PIPE_SHADER_COMPUTE) {
414 prog->cp.syms = info->bin.syms;
415 prog->cp.num_syms = info->bin.numSyms;
416 } else {
417 FREE(info->bin.syms);
418 }
419
420 if (prog->pipe.stream_output.num_outputs)
421 prog->so = nv50_program_create_strmout_state(info,
422 &prog->pipe.stream_output);
423
424 pipe_debug_message(debug, SHADER_INFO,
425 "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
426 prog->type, info->bin.tlsSpace, prog->max_gpr,
427 info->bin.instructions, info->bin.codeSize);
428
429 out:
430 FREE(info);
431 return !ret;
432 }
433
434 bool
435 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
436 {
437 struct nouveau_heap *heap;
438 int ret;
439 uint32_t size = align(prog->code_size, 0x40);
440 uint8_t prog_type;
441
442 switch (prog->type) {
443 case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
444 case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
445 case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
446 case PIPE_SHADER_COMPUTE: heap = nv50->screen->fp_code_heap; break;
447 default:
448 assert(!"invalid program type");
449 return false;
450 }
451
452 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
453 if (ret) {
454 /* Out of space: evict everything to compactify the code segment, hoping
455 * the working set is much smaller and drifts slowly. Improve me !
456 */
457 while (heap->next) {
458 struct nv50_program *evict = heap->next->priv;
459 if (evict)
460 nouveau_heap_free(&evict->mem);
461 }
462 debug_printf("WARNING: out of code space, evicting all shaders.\n");
463 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
464 if (ret) {
465 NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
466 return false;
467 }
468 }
469
470 if (prog->type == PIPE_SHADER_COMPUTE) {
471 /* CP code must be uploaded in FP code segment. */
472 prog_type = 1;
473 } else {
474 prog->code_base = prog->mem->start;
475 prog_type = prog->type;
476 }
477
478 ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
479 if (ret < 0) {
480 nouveau_heap_free(&prog->mem);
481 return false;
482 }
483 if (ret > 0)
484 nv50->state.new_tls_space = true;
485
486 if (prog->fixups)
487 nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
488 if (prog->interps)
489 nv50_ir_apply_fixups(prog->interps, prog->code,
490 prog->fp.force_persample_interp,
491 false /* flatshade */);
492
493 nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
494 (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
495 NOUVEAU_BO_VRAM, prog->code_size, prog->code);
496
497 BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
498 PUSH_DATA (nv50->base.pushbuf, 0);
499
500 return true;
501 }
502
503 void
504 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
505 {
506 const struct pipe_shader_state pipe = p->pipe;
507 const ubyte type = p->type;
508
509 if (p->mem)
510 nouveau_heap_free(&p->mem);
511
512 FREE(p->code);
513
514 FREE(p->fixups);
515 FREE(p->interps);
516 FREE(p->so);
517
518 if (type == PIPE_SHADER_COMPUTE)
519 FREE(p->cp.syms);
520
521 memset(p, 0, sizeof(*p));
522
523 p->pipe = pipe;
524 p->type = type;
525 }