/*
 * Copyright 2010 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

23 #include "pipe/p_defines.h"
25 #include "compiler/nir/nir.h"
27 #include "nv50/nv50_program.h"
28 #include "nv50/nv50_context.h"
30 #include "codegen/nv50_ir_driver.h"
/* Return the number of set bits in the low nibble of val (result 0..4).
 * Bits above bit 3 are ignored.
 */
static inline unsigned
bitcount4(const uint32_t val)
{
   const uint32_t nibble = val & 0xf;
   /* Add neighbouring bits: each 2-bit field now holds a count of 0..2. */
   const uint32_t pairs = (nibble & 0x5) + ((nibble >> 1) & 0x5);

   /* Add the two 2-bit fields together. */
   return (unsigned)((pairs & 0x3) + (pairs >> 2));
}

41 nv50_vertprog_assign_slots(struct nv50_ir_prog_info
*info
)
43 struct nv50_program
*prog
= (struct nv50_program
*)info
->driverPriv
;
47 for (i
= 0; i
< info
->numInputs
; ++i
) {
49 prog
->in
[i
].sn
= info
->in
[i
].sn
;
50 prog
->in
[i
].si
= info
->in
[i
].si
;
52 prog
->in
[i
].mask
= info
->in
[i
].mask
;
54 prog
->vp
.attrs
[(4 * i
) / 32] |= info
->in
[i
].mask
<< ((4 * i
) % 32);
56 for (c
= 0; c
< 4; ++c
)
57 if (info
->in
[i
].mask
& (1 << c
))
58 info
->in
[i
].slot
[c
] = n
++;
60 if (info
->in
[i
].sn
== TGSI_SEMANTIC_PRIMID
)
61 prog
->vp
.attrs
[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID
;
63 prog
->in_nr
= info
->numInputs
;
65 for (i
= 0; i
< info
->numSysVals
; ++i
) {
66 switch (info
->sv
[i
].sn
) {
67 case TGSI_SEMANTIC_INSTANCEID
:
68 prog
->vp
.attrs
[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID
;
70 case TGSI_SEMANTIC_VERTEXID
:
71 prog
->vp
.attrs
[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID
;
72 prog
->vp
.attrs
[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START
;
80 * Corner case: VP has no inputs, but we will still need to submit data to
81 * draw it. HW will shout at us and won't draw anything if we don't enable
82 * any input, so let's just pretend it's the first one.
84 if (prog
->vp
.attrs
[0] == 0 &&
85 prog
->vp
.attrs
[1] == 0 &&
86 prog
->vp
.attrs
[2] == 0)
87 prog
->vp
.attrs
[0] |= 0xf;
89 /* VertexID before InstanceID */
90 if (info
->io
.vertexId
< info
->numSysVals
)
91 info
->sv
[info
->io
.vertexId
].slot
[0] = n
++;
92 if (info
->io
.instanceId
< info
->numSysVals
)
93 info
->sv
[info
->io
.instanceId
].slot
[0] = n
++;
96 for (i
= 0; i
< info
->numOutputs
; ++i
) {
97 switch (info
->out
[i
].sn
) {
98 case TGSI_SEMANTIC_PSIZE
:
101 case TGSI_SEMANTIC_CLIPDIST
:
102 prog
->vp
.clpd
[info
->out
[i
].si
] = n
;
104 case TGSI_SEMANTIC_EDGEFLAG
:
105 prog
->vp
.edgeflag
= i
;
107 case TGSI_SEMANTIC_BCOLOR
:
108 prog
->vp
.bfc
[info
->out
[i
].si
] = i
;
110 case TGSI_SEMANTIC_LAYER
:
111 prog
->gp
.has_layer
= true;
112 prog
->gp
.layerid
= n
;
114 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
115 prog
->gp
.has_viewport
= true;
116 prog
->gp
.viewportid
= n
;
122 prog
->out
[i
].sn
= info
->out
[i
].sn
;
123 prog
->out
[i
].si
= info
->out
[i
].si
;
125 prog
->out
[i
].mask
= info
->out
[i
].mask
;
127 for (c
= 0; c
< 4; ++c
)
128 if (info
->out
[i
].mask
& (1 << c
))
129 info
->out
[i
].slot
[c
] = n
++;
131 prog
->out_nr
= info
->numOutputs
;
136 if (prog
->vp
.psiz
< info
->numOutputs
)
137 prog
->vp
.psiz
= prog
->out
[prog
->vp
.psiz
].hw
;
143 nv50_fragprog_assign_slots(struct nv50_ir_prog_info
*info
)
145 struct nv50_program
*prog
= (struct nv50_program
*)info
->driverPriv
;
151 /* count recorded non-flat inputs */
152 for (m
= 0, i
= 0; i
< info
->numInputs
; ++i
) {
153 switch (info
->in
[i
].sn
) {
154 case TGSI_SEMANTIC_POSITION
:
157 m
+= info
->in
[i
].flat
? 0 : 1;
161 /* careful: id may be != i in info->in[prog->in[i].id] */
163 /* Fill prog->in[] so that non-flat inputs are first and
164 * kick out special inputs that don't use the RESULT_MAP.
166 for (n
= 0, i
= 0; i
< info
->numInputs
; ++i
) {
167 if (info
->in
[i
].sn
== TGSI_SEMANTIC_POSITION
) {
168 prog
->fp
.interp
|= info
->in
[i
].mask
<< 24;
169 for (c
= 0; c
< 4; ++c
)
170 if (info
->in
[i
].mask
& (1 << c
))
171 info
->in
[i
].slot
[c
] = nintp
++;
173 unsigned j
= info
->in
[i
].flat
? m
++ : n
++;
175 if (info
->in
[i
].sn
== TGSI_SEMANTIC_COLOR
)
176 prog
->vp
.bfc
[info
->in
[i
].si
] = j
;
177 else if (info
->in
[i
].sn
== TGSI_SEMANTIC_PRIMID
)
178 prog
->vp
.attrs
[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID
;
181 prog
->in
[j
].mask
= info
->in
[i
].mask
;
182 prog
->in
[j
].sn
= info
->in
[i
].sn
;
183 prog
->in
[j
].si
= info
->in
[i
].si
;
184 prog
->in
[j
].linear
= info
->in
[i
].linear
;
189 if (!(prog
->fp
.interp
& (8 << 24))) {
191 prog
->fp
.interp
|= 8 << 24;
194 for (i
= 0; i
< prog
->in_nr
; ++i
) {
195 int j
= prog
->in
[i
].id
;
197 prog
->in
[i
].hw
= nintp
;
198 for (c
= 0; c
< 4; ++c
)
199 if (prog
->in
[i
].mask
& (1 << c
))
200 info
->in
[j
].slot
[c
] = nintp
++;
202 /* (n == m) if m never increased, i.e. no flat inputs */
203 nflat
= (n
< m
) ? (nintp
- prog
->in
[n
].hw
) : 0;
204 nintp
-= bitcount4(prog
->fp
.interp
>> 24); /* subtract position inputs */
205 nvary
= nintp
- nflat
;
207 prog
->fp
.interp
|= nvary
<< NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT
;
208 prog
->fp
.interp
|= nintp
<< NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT
;
210 /* put front/back colors right after HPOS */
211 prog
->fp
.colors
= 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT
;
212 for (i
= 0; i
< 2; ++i
)
213 if (prog
->vp
.bfc
[i
] < 0xff)
214 prog
->fp
.colors
+= bitcount4(prog
->in
[prog
->vp
.bfc
[i
]].mask
) << 16;
218 if (info
->prop
.fp
.numColourResults
> 1)
219 prog
->fp
.flags
[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS
;
221 for (i
= 0; i
< info
->numOutputs
; ++i
) {
223 prog
->out
[i
].sn
= info
->out
[i
].sn
;
224 prog
->out
[i
].si
= info
->out
[i
].si
;
225 prog
->out
[i
].mask
= info
->out
[i
].mask
;
227 if (i
== info
->io
.fragDepth
|| i
== info
->io
.sampleMask
)
229 prog
->out
[i
].hw
= info
->out
[i
].si
* 4;
231 for (c
= 0; c
< 4; ++c
)
232 info
->out
[i
].slot
[c
] = prog
->out
[i
].hw
+ c
;
234 prog
->max_out
= MAX2(prog
->max_out
, prog
->out
[i
].hw
+ 4);
237 if (info
->io
.sampleMask
< PIPE_MAX_SHADER_OUTPUTS
) {
238 info
->out
[info
->io
.sampleMask
].slot
[0] = prog
->max_out
++;
239 prog
->fp
.has_samplemask
= 1;
242 if (info
->io
.fragDepth
< PIPE_MAX_SHADER_OUTPUTS
)
243 info
->out
[info
->io
.fragDepth
].slot
[2] = prog
->max_out
++;
252 nv50_program_assign_varying_slots(struct nv50_ir_prog_info
*info
)
254 switch (info
->type
) {
255 case PIPE_SHADER_VERTEX
:
256 return nv50_vertprog_assign_slots(info
);
257 case PIPE_SHADER_GEOMETRY
:
258 return nv50_vertprog_assign_slots(info
);
259 case PIPE_SHADER_FRAGMENT
:
260 return nv50_fragprog_assign_slots(info
);
261 case PIPE_SHADER_COMPUTE
:
268 static struct nv50_stream_output_state
*
269 nv50_program_create_strmout_state(const struct nv50_ir_prog_info
*info
,
270 const struct pipe_stream_output_info
*pso
)
272 struct nv50_stream_output_state
*so
;
276 so
= MALLOC_STRUCT(nv50_stream_output_state
);
279 memset(so
->map
, 0xff, sizeof(so
->map
));
281 for (b
= 0; b
< 4; ++b
)
282 so
->num_attribs
[b
] = 0;
283 for (i
= 0; i
< pso
->num_outputs
; ++i
) {
284 unsigned end
= pso
->output
[i
].dst_offset
+ pso
->output
[i
].num_components
;
285 b
= pso
->output
[i
].output_buffer
;
287 so
->num_attribs
[b
] = MAX2(so
->num_attribs
[b
], end
);
290 so
->ctrl
= NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED
;
292 so
->stride
[0] = pso
->stride
[0] * 4;
294 for (b
= 1; b
< 4; ++b
) {
295 assert(!so
->num_attribs
[b
] || so
->num_attribs
[b
] == pso
->stride
[b
]);
296 so
->stride
[b
] = so
->num_attribs
[b
] * 4;
297 if (so
->num_attribs
[b
])
298 so
->ctrl
= (b
+ 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT
;
299 base
[b
] = align(base
[b
- 1] + so
->num_attribs
[b
- 1], 4);
301 if (so
->ctrl
& NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED
) {
302 assert(so
->stride
[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX
);
303 so
->ctrl
|= so
->stride
[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT
;
306 so
->map_size
= base
[3] + so
->num_attribs
[3];
308 for (i
= 0; i
< pso
->num_outputs
; ++i
) {
309 const unsigned s
= pso
->output
[i
].start_component
;
310 const unsigned p
= pso
->output
[i
].dst_offset
;
311 const unsigned r
= pso
->output
[i
].register_index
;
312 b
= pso
->output
[i
].output_buffer
;
314 if (r
>= info
->numOutputs
)
317 for (c
= 0; c
< pso
->output
[i
].num_components
; ++c
)
318 so
->map
[base
[b
] + p
+ c
] = info
->out
[r
].slot
[s
+ c
];
325 nv50_program_translate(struct nv50_program
*prog
, uint16_t chipset
,
326 struct pipe_debug_callback
*debug
)
328 struct nv50_ir_prog_info
*info
;
330 const uint8_t map_undef
= (prog
->type
== PIPE_SHADER_VERTEX
) ? 0x40 : 0x80;
332 info
= CALLOC_STRUCT(nv50_ir_prog_info
);
336 info
->type
= prog
->type
;
337 info
->target
= chipset
;
339 info
->bin
.sourceRep
= prog
->pipe
.type
;
340 switch (prog
->pipe
.type
) {
341 case PIPE_SHADER_IR_TGSI
:
342 info
->bin
.source
= (void *)prog
->pipe
.tokens
;
344 case PIPE_SHADER_IR_NIR
:
345 info
->bin
.source
= (void *)nir_shader_clone(NULL
, prog
->pipe
.ir
.nir
);
348 assert(!"unsupported IR!");
352 info
->bin
.smemSize
= prog
->cp
.smem_size
;
353 info
->io
.auxCBSlot
= 15;
354 info
->io
.ucpBase
= NV50_CB_AUX_UCP_OFFSET
;
355 info
->io
.genUserClip
= prog
->vp
.clpd_nr
;
356 if (prog
->fp
.alphatest
)
357 info
->io
.alphaRefBase
= NV50_CB_AUX_ALPHATEST_OFFSET
;
359 info
->io
.suInfoBase
= NV50_CB_AUX_TEX_MS_OFFSET
;
360 info
->io
.sampleInfoBase
= NV50_CB_AUX_SAMPLE_OFFSET
;
361 info
->io
.msInfoCBSlot
= 15;
362 info
->io
.msInfoBase
= NV50_CB_AUX_MS_OFFSET
;
364 info
->assignSlots
= nv50_program_assign_varying_slots
;
366 prog
->vp
.bfc
[0] = 0xff;
367 prog
->vp
.bfc
[1] = 0xff;
368 prog
->vp
.edgeflag
= 0xff;
369 prog
->vp
.clpd
[0] = map_undef
;
370 prog
->vp
.clpd
[1] = map_undef
;
371 prog
->vp
.psiz
= map_undef
;
372 prog
->gp
.has_layer
= 0;
373 prog
->gp
.has_viewport
= 0;
375 if (prog
->type
== PIPE_SHADER_COMPUTE
)
376 info
->prop
.cp
.inputOffset
= 0x10;
378 info
->driverPriv
= prog
;
381 info
->optLevel
= debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
382 info
->dbgFlags
= debug_get_num_option("NV50_PROG_DEBUG", 0);
383 info
->omitLineNum
= debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0);
388 ret
= nv50_ir_generate_code(info
);
390 NOUVEAU_ERR("shader translation failed: %i\n", ret
);
394 prog
->code
= info
->bin
.code
;
395 prog
->code_size
= info
->bin
.codeSize
;
396 prog
->fixups
= info
->bin
.relocData
;
397 prog
->interps
= info
->bin
.fixupData
;
398 prog
->max_gpr
= MAX2(4, (info
->bin
.maxGPR
>> 1) + 1);
399 prog
->tls_space
= info
->bin
.tlsSpace
;
400 prog
->cp
.smem_size
= info
->bin
.smemSize
;
401 prog
->mul_zero_wins
= info
->io
.mul_zero_wins
;
402 prog
->vp
.need_vertex_id
= info
->io
.vertexId
< PIPE_MAX_SHADER_INPUTS
;
404 prog
->vp
.clip_enable
= (1 << info
->io
.clipDistances
) - 1;
405 prog
->vp
.cull_enable
=
406 ((1 << info
->io
.cullDistances
) - 1) << info
->io
.clipDistances
;
407 prog
->vp
.clip_mode
= 0;
408 for (i
= 0; i
< info
->io
.cullDistances
; ++i
)
409 prog
->vp
.clip_mode
|= 1 << ((info
->io
.clipDistances
+ i
) * 4);
411 if (prog
->type
== PIPE_SHADER_FRAGMENT
) {
412 if (info
->prop
.fp
.writesDepth
) {
413 prog
->fp
.flags
[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z
;
414 prog
->fp
.flags
[1] = 0x11;
416 if (info
->prop
.fp
.usesDiscard
)
417 prog
->fp
.flags
[0] |= NV50_3D_FP_CONTROL_USES_KIL
;
419 if (prog
->type
== PIPE_SHADER_GEOMETRY
) {
420 switch (info
->prop
.gp
.outputPrim
) {
421 case PIPE_PRIM_LINE_STRIP
:
422 prog
->gp
.prim_type
= NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP
;
424 case PIPE_PRIM_TRIANGLE_STRIP
:
425 prog
->gp
.prim_type
= NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP
;
427 case PIPE_PRIM_POINTS
:
429 assert(info
->prop
.gp
.outputPrim
== PIPE_PRIM_POINTS
);
430 prog
->gp
.prim_type
= NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS
;
433 prog
->gp
.vert_count
= CLAMP(info
->prop
.gp
.maxVertices
, 1, 1024);
436 if (prog
->type
== PIPE_SHADER_COMPUTE
) {
437 prog
->cp
.syms
= info
->bin
.syms
;
438 prog
->cp
.num_syms
= info
->bin
.numSyms
;
440 FREE(info
->bin
.syms
);
443 if (prog
->pipe
.stream_output
.num_outputs
)
444 prog
->so
= nv50_program_create_strmout_state(info
,
445 &prog
->pipe
.stream_output
);
447 pipe_debug_message(debug
, SHADER_INFO
,
448 "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
449 prog
->type
, info
->bin
.tlsSpace
, info
->bin
.smemSize
,
450 prog
->max_gpr
, info
->bin
.instructions
,
454 if (info
->bin
.sourceRep
== PIPE_SHADER_IR_NIR
)
455 ralloc_free((void *)info
->bin
.source
);
461 nv50_program_upload_code(struct nv50_context
*nv50
, struct nv50_program
*prog
)
463 struct nouveau_heap
*heap
;
465 uint32_t size
= align(prog
->code_size
, 0x40);
468 switch (prog
->type
) {
469 case PIPE_SHADER_VERTEX
: heap
= nv50
->screen
->vp_code_heap
; break;
470 case PIPE_SHADER_GEOMETRY
: heap
= nv50
->screen
->gp_code_heap
; break;
471 case PIPE_SHADER_FRAGMENT
: heap
= nv50
->screen
->fp_code_heap
; break;
472 case PIPE_SHADER_COMPUTE
: heap
= nv50
->screen
->fp_code_heap
; break;
474 assert(!"invalid program type");
478 ret
= nouveau_heap_alloc(heap
, size
, prog
, &prog
->mem
);
480 /* Out of space: evict everything to compactify the code segment, hoping
481 * the working set is much smaller and drifts slowly. Improve me !
484 struct nv50_program
*evict
= heap
->next
->priv
;
486 nouveau_heap_free(&evict
->mem
);
488 debug_printf("WARNING: out of code space, evicting all shaders.\n");
489 ret
= nouveau_heap_alloc(heap
, size
, prog
, &prog
->mem
);
491 NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size
);
496 if (prog
->type
== PIPE_SHADER_COMPUTE
) {
497 /* CP code must be uploaded in FP code segment. */
500 prog
->code_base
= prog
->mem
->start
;
501 prog_type
= prog
->type
;
504 ret
= nv50_tls_realloc(nv50
->screen
, prog
->tls_space
);
506 nouveau_heap_free(&prog
->mem
);
510 nv50
->state
.new_tls_space
= true;
513 nv50_ir_relocate_code(prog
->fixups
, prog
->code
, prog
->code_base
, 0, 0);
515 nv50_ir_apply_fixups(prog
->interps
, prog
->code
,
516 prog
->fp
.force_persample_interp
,
517 false /* flatshade */,
518 prog
->fp
.alphatest
- 1);
520 nv50_sifc_linear_u8(&nv50
->base
, nv50
->screen
->code
,
521 (prog_type
<< NV50_CODE_BO_SIZE_LOG2
) + prog
->code_base
,
522 NOUVEAU_BO_VRAM
, prog
->code_size
, prog
->code
);
524 BEGIN_NV04(nv50
->base
.pushbuf
, NV50_3D(CODE_CB_FLUSH
), 1);
525 PUSH_DATA (nv50
->base
.pushbuf
, 0);
531 nv50_program_destroy(struct nv50_context
*nv50
, struct nv50_program
*p
)
533 const struct pipe_shader_state pipe
= p
->pipe
;
534 const ubyte type
= p
->type
;
537 nouveau_heap_free(&p
->mem
);
545 if (type
== PIPE_SHADER_COMPUTE
)
548 memset(p
, 0, sizeof(*p
));