1 /* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
4 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 * Rob Clark <robclark@freedesktop.org>
29 #include "pipe/p_state.h"
30 #include "util/u_string.h"
31 #include "util/u_memory.h"
32 #include "util/u_inlines.h"
33 #include "util/u_format.h"
34 #include "tgsi/tgsi_dump.h"
35 #include "tgsi/tgsi_parse.h"
37 #include "freedreno_lowering.h"
38 #include "freedreno_program.h"
40 #include "fd3_program.h"
41 #include "fd3_compiler.h"
43 #include "fd3_texture.h"
// Tears down a shader state object: the visible code hands so->ir to
// ir3_shader_destroy().
// NOTE(review): this listing skips source lines (the embedded numbering
// jumps from 47 to 49 and ends there), so any further cleanup of `so` is
// not shown here.
47 delete_shader(struct fd3_shader_stateobj
*so
)
49 ir3_shader_destroy(so
->ir
);
// Assembles the shader IR into a binary and uploads it to a new buffer
// object.
//   pctx - pipe context; only used to reach the fd_context and its device.
//   so   - shader state object; reads so->ir, writes so->info, so->bo,
//          so->instrlen and so->constlen.
// NOTE(review): the declarations of `sz`/`bin`, the release of `bin` and the
// return statement are not visible in this listing.
55 assemble_shader(struct pipe_context
*pctx
, struct fd3_shader_stateobj
*so
)
57 struct fd_context
*ctx
= fd_context(pctx
);
// ir3_shader_assemble() also fills so->info, which is read just below.
60 bin
= ir3_shader_assemble(so
->ir
, &so
->info
);
// presumably sizedwords counts 32-bit words, making sz a byte count -- confirm
61 sz
= so
->info
.sizedwords
* 4;
// NOTE(review): the results of fd_bo_new() and fd_bo_map() are used without
// a check in the visible lines -- confirm failure is handled elsewhere.
63 so
->bo
= fd_bo_new(ctx
->dev
, sz
,
64 DRM_FREEDRENO_GEM_CACHE_WCOMBINE
|
65 DRM_FREEDRENO_GEM_TYPE_KMEM
);
67 memcpy(fd_bo_map(so
->bo
), bin
, sz
);
// instrlen is expressed in units of 8 dwords; constlen is one more than the
// highest constant index reported by the assembler.
71 so
->instrlen
= so
->info
.sizedwords
/ 8;
72 so
->constlen
= so
->info
.max_const
+ 1;
75 /* for vertex shader, the inputs are loaded into registers before the shader
76 * is executed, so max_reg from the shader instructions might not properly
77 * reflect the # of registers actually used:
// Raises so->info.max_reg so that it covers every declared input and output.
// For each entry, (regid + 3) >> 2 is compared against the current max_reg
// and the larger value is kept (MAX2).
// presumably regid is a scalar-component index and >> 2 converts it to a
// four-component register number -- confirm against the regid encoding.
80 fixup_vp_regfootprint(struct fd3_shader_stateobj
*so
)
83 for (i
= 0; i
< so
->inputs_count
; i
++)
84 so
->info
.max_reg
= MAX2(so
->info
.max_reg
, (so
->inputs
[i
].regid
+ 3) >> 2);
85 for (i
= 0; i
< so
->outputs_count
; i
++)
86 so
->info
.max_reg
= MAX2(so
->info
.max_reg
, (so
->outputs
[i
].regid
+ 3) >> 2);
// Builds a shader state object from a gallium shader CSO.
//   pctx - pipe context, forwarded to assemble_shader().
//   cso  - shader state; only cso->tokens (the TGSI token stream) is read.
//   type - shader stage, compared against SHADER_FRAGMENT / SHADER_VERTEX
//          (its declaration is on a line not visible in this listing).
// Visible steps: zero-allocate the object, optionally dump the TGSI, compile
// (new compiler first, old compiler as fallback), assemble, fix up the
// vertex register footprint, optionally disassemble.
// NOTE(review): the conditions guarding the error messages, the error-path
// cleanup and the return statements are not visible here.
89 static struct fd3_shader_stateobj
*
90 create_shader(struct pipe_context
*pctx
, const struct pipe_shader_state
*cso
,
93 struct fd3_shader_stateobj
*so
= CALLOC_STRUCT(fd3_shader_stateobj
);
94 const struct tgsi_token
*tokens
= cso
->tokens
;
// Debug aid: dump the incoming TGSI when FD_DBG_DISASM is set.
102 if (fd_mesa_debug
& FD_DBG_DISASM
) {
103 DBG("dump tgsi: type=%d", so
->type
);
104 tgsi_dump(tokens
, 0);
// Half precision is only requested for fragment shaders, and only when the
// FD_DBG_FRAGHALF debug flag is set.
107 if ((type
== SHADER_FRAGMENT
) && (fd_mesa_debug
& FD_DBG_FRAGHALF
))
108 so
->half_precision
= true;
// Try the new compiler unless FD_DBG_NOOPT disables it. The counters below
// are reset after a failed attempt, before the old compiler runs.
111 if (!(fd_mesa_debug
& FD_DBG_NOOPT
)) {
112 ret
= fd3_compile_shader(so
, tokens
);
114 debug_error("new compiler failed, trying fallback!");
116 so
->inputs_count
= 0;
117 so
->outputs_count
= 0;
119 so
->samplers_count
= 0;
120 so
->immediates_count
= 0;
123 ret
= -1; /* force fallback to old compiler */
// Fallback path: old compiler.
127 ret
= fd3_compile_shader_old(so
, tokens
);
130 debug_error("compile failed!");
134 assemble_shader(pctx
, so
);
136 debug_error("assemble failed!");
// Vertex shaders get max_reg widened to cover their inputs and outputs.
140 if (type
== SHADER_VERTEX
)
141 fixup_vp_regfootprint(so
);
// Debug aid: disassemble the uploaded binary when FD_DBG_DISASM is set.
143 if (fd_mesa_debug
& FD_DBG_DISASM
) {
144 DBG("disassemble: type=%d", so
->type
);
145 disasm_a3xx(fd_bo_map(so
->bo
), so
->info
.sizedwords
, 0, so
->type
);
// Fragment-shader create hook (installed as pctx->create_fs_state by
// fd3_prog_init): forwards to create_shader() with SHADER_FRAGMENT.
156 fd3_fp_state_create(struct pipe_context
*pctx
,
157 const struct pipe_shader_state
*cso
)
159 return create_shader(pctx
, cso
, SHADER_FRAGMENT
);
// Fragment-shader delete hook (installed as pctx->delete_fs_state by
// fd3_prog_init). hwcso is the opaque object being destroyed; it is
// converted back to a struct fd3_shader_stateobj pointer.
// NOTE(review): the statement that consumes `so` is not visible in this
// listing -- presumably it is passed to delete_shader(); confirm.
163 fd3_fp_state_delete(struct pipe_context
*pctx
, void *hwcso
)
165 struct fd3_shader_stateobj
*so
= hwcso
;
// Vertex-shader create hook (installed as pctx->create_vs_state by
// fd3_prog_init): forwards to create_shader() with SHADER_VERTEX.
170 fd3_vp_state_create(struct pipe_context
*pctx
,
171 const struct pipe_shader_state
*cso
)
173 return create_shader(pctx
, cso
, SHADER_VERTEX
);
// Vertex-shader delete hook (installed as pctx->delete_vs_state by
// fd3_prog_init). hwcso is the opaque object being destroyed; it is
// converted back to a struct fd3_shader_stateobj pointer.
// NOTE(review): the statement that consumes `so` is not visible in this
// listing -- presumably it is passed to delete_shader(); confirm.
177 fd3_vp_state_delete(struct pipe_context
*pctx
, void *hwcso
)
179 struct fd3_shader_stateobj
*so
= hwcso
;
// Emits a CP_LOAD_STATE packet that loads one shader's instructions.
//   ring - command stream to write into.
//   so   - shader to load; reads so->type, so->bo, so->instrlen.
// NOTE(review): the assignments to `sb`, `src` and `sz`, and the branch
// keywords separating the alternatives below, are not visible in this
// listing.
184 emit_shader(struct fd_ringbuffer
*ring
, const struct fd3_shader_stateobj
*so
)
186 const struct ir3_shader_info
*si
= &so
->info
;
187 enum adreno_state_block sb
;
188 enum adreno_state_src src
;
189 uint32_t i
, sz
, *bin
;
// The shader stage selects the state block (assignment lines not shown).
191 if (so
->type
== SHADER_VERTEX
) {
// FD_DBG_DIRECT selects how the instructions are sourced; in the visible
// lines the buffer object is mapped so its contents can be read via `bin`.
197 if (fd_mesa_debug
& FD_DBG_DIRECT
) {
200 bin
= fd_bo_map(so
->bo
);
// Packet payload: two header dwords plus `sz` dwords of instruction data.
207 OUT_PKT3(ring
, CP_LOAD_STATE
, 2 + sz
);
208 OUT_RING(ring
, CP_LOAD_STATE_0_DST_OFF(0) |
209 CP_LOAD_STATE_0_STATE_SRC(src
) |
210 CP_LOAD_STATE_0_STATE_BLOCK(sb
) |
211 CP_LOAD_STATE_0_NUM_UNIT(so
->instrlen
));
// Second header dword: either a zero external address, or a relocation to
// so->bo; both carry STATE_TYPE(ST_SHADER). Presumably the two forms belong
// to opposite branches of the FD_DBG_DIRECT test -- confirm.
213 OUT_RING(ring
, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
214 CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER
));
216 OUT_RELOC(ring
, so
->bo
, 0,
217 CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER
), 0);
// Inline the instruction dwords into the command stream.
219 for (i
= 0; i
< sz
; i
++) {
220 OUT_RING(ring
, bin
[i
]);
// Scans so->outputs[0 .. outputs_count) for an entry whose semantic equals
// `semantic`.
// NOTE(review): the return statements are not visible in this listing.
// Callers use the result to index vp->outputs[], so it presumably returns
// the matching index j; the not-found value cannot be determined from here.
225 find_output(const struct fd3_shader_stateobj
*so
, fd3_semantic semantic
)
228 for (j
= 0; j
< so
->outputs_count
; j
++)
229 if (so
->outputs
[j
].semantic
== semantic
)
// Returns the regid of the first entry in so->outputs[0 .. outputs_count)
// whose semantic equals `semantic`.
// NOTE(review): the value returned when nothing matches is on a line not
// visible in this listing.
235 find_output_regid(const struct fd3_shader_stateobj
*so
, fd3_semantic semantic
)
238 for (j
= 0; j
< so
->outputs_count
; j
++)
239 if (so
->outputs
[j
].semantic
== semantic
)
240 return so
->outputs
[j
].regid
;
// Emits the register writes and shader uploads that bind a vertex/fragment
// program pair on a3xx.
//   ring    - command stream to write into.
//   prog    - program state; prog->vp and prog->fp are the two shaders.
//   binning - when true, the BINNING bit is set in SP_SP_CTRL_REG.
// NOTE(review): this listing omits the branch keywords (if/else) and some
// declarations. Where two alternative register sequences appear back to
// back, they presumably belong to the binning and non-binning paths --
// confirm against the complete source.
245 fd3_program_emit(struct fd_ringbuffer
*ring
,
246 struct fd_program_stateobj
*prog
, bool binning
)
248 const struct fd3_shader_stateobj
*vp
= prog
->vp
;
249 const struct fd3_shader_stateobj
*fp
= prog
->fp
;
250 const struct ir3_shader_info
*vsi
= &vp
->info
;
251 const struct ir3_shader_info
*fsi
= &fp
->info
;
252 uint32_t pos_regid
, posz_regid
, psize_regid
, color_regid
;
256 /* use dummy stateobj to simplify binning vs non-binning: */
257 static const struct fd3_shader_stateobj binning_fp
= {};
// Look up the output registers: position and point size from the vertex
// shader, depth (position semantic) and color from the fragment shader.
262 pos_regid
= find_output_regid(vp
,
263 fd3_semantic_name(TGSI_SEMANTIC_POSITION
, 0));
264 posz_regid
= find_output_regid(fp
,
265 fd3_semantic_name(TGSI_SEMANTIC_POSITION
, 0));
266 psize_regid
= find_output_regid(vp
,
267 fd3_semantic_name(TGSI_SEMANTIC_PSIZE
, 0));
268 color_regid
= find_output_regid(fp
,
269 fd3_semantic_name(TGSI_SEMANTIC_COLOR
, 0));
271 /* we could probably divide this up into things that need to be
272 * emitted if frag-prog is dirty vs if vert-prog is dirty..
275 OUT_PKT0(ring
, REG_A3XX_HLSQ_CONTROL_0_REG
, 6);
276 OUT_RING(ring
, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS
) |
277 /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe
278 * flush some caches? I think we only need to set those
279 * bits if we have updated const or shader..
281 A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART
|
282 A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE
);
283 OUT_RING(ring
, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS
) |
284 A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE
);
285 OUT_RING(ring
, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
286 OUT_RING(ring
, 0x00000000); /* HLSQ_CONTROL_3_REG */
287 OUT_RING(ring
, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp
->constlen
) |
288 A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
289 A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vp
->instrlen
));
290 OUT_RING(ring
, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp
->constlen
) |
291 A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
292 A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fp
->instrlen
));
// SP control: the BINNING bit is OR-ed in only when `binning` is true.
294 OUT_PKT0(ring
, REG_A3XX_SP_SP_CTRL_REG
, 1);
295 OUT_RING(ring
, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) |
296 COND(binning
, A3XX_SP_SP_CTRL_REG_BINNING
) |
297 A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) |
298 A3XX_SP_SP_CTRL_REG_L0MODE(0));
// Vertex shader: length, control registers (register footprints are
// max_half_reg + 1 and max_reg + 1) and the position / point-size regids.
300 OUT_PKT0(ring
, REG_A3XX_SP_VS_LENGTH_REG
, 1);
301 OUT_RING(ring
, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp
->instrlen
));
303 OUT_PKT0(ring
, REG_A3XX_SP_VS_CTRL_REG0
, 3);
304 OUT_RING(ring
, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI
) |
305 A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER
) |
306 A3XX_SP_VS_CTRL_REG0_CACHEINVALID
|
307 A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi
->max_half_reg
+ 1) |
308 A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi
->max_reg
+ 1) |
309 A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
310 A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS
) |
311 A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE
|
312 COND(vp
->samplers_count
> 0, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE
) |
313 A3XX_SP_VS_CTRL_REG0_LENGTH(vp
->instrlen
));
314 OUT_RING(ring
, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp
->constlen
) |
315 A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp
->total_in
) |
316 A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vsi
->max_const
, 0)));
317 OUT_RING(ring
, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid
) |
318 A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid
) |
319 A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp
->inputs_count
));
// Link stage, part 1: for each fragment input, find the vertex output with
// the same semantic and record its regid plus the input's component mask.
// Two inputs (the A and B fields) share one SP_VS_OUT_REG, indexed by i/2.
// NOTE(review): the increments of `i` and the write of `reg` are on lines
// not visible here.
321 for (i
= 0; i
< fp
->inputs_count
; ) {
325 OUT_PKT0(ring
, REG_A3XX_SP_VS_OUT_REG(i
/2), 1);
327 j
= find_output(vp
, fp
->inputs
[i
].semantic
);
328 reg
|= A3XX_SP_VS_OUT_REG_A_REGID(vp
->outputs
[j
].regid
);
329 reg
|= A3XX_SP_VS_OUT_REG_A_COMPMASK(fp
->inputs
[i
].compmask
);
332 j
= find_output(vp
, fp
->inputs
[i
].semantic
);
333 reg
|= A3XX_SP_VS_OUT_REG_B_REGID(vp
->outputs
[j
].regid
);
334 reg
|= A3XX_SP_VS_OUT_REG_B_COMPMASK(fp
->inputs
[i
].compmask
);
// Link stage, part 2: four input locations (inloc) are packed into each
// SP_VS_VPC_DST_REG, indexed by i/4.
// NOTE(review): the four inputs[i++] reads are consecutive in the visible
// lines, so when inputs_count is not a multiple of four the trailing reads
// touch entries at or past inputs_count -- confirm fp->inputs[] is sized to
// make that safe.
340 for (i
= 0; i
< fp
->inputs_count
; ) {
343 OUT_PKT0(ring
, REG_A3XX_SP_VS_VPC_DST_REG(i
/4), 1);
345 reg
|= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(fp
->inputs
[i
++].inloc
);
346 reg
|= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(fp
->inputs
[i
++].inloc
);
347 reg
|= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(fp
->inputs
[i
++].inloc
);
348 reg
|= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(fp
->inputs
[i
++].inloc
);
// Vertex shader object offsets, followed by a relocation to vp->bo.
353 OUT_PKT0(ring
, REG_A3XX_SP_VS_OBJ_OFFSET_REG
, 2);
354 OUT_RING(ring
, A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(0) |
355 A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
356 OUT_RELOC(ring
, vp
->bo
, 0, 0, 0); /* SP_VS_OBJ_START_REG */
// Fragment shader length/control. Two alternatives are visible: a minimal
// one (zero length, only THREADMODE/INSTRBUFFERMODE set) and a full one
// populated from fp/fsi that also emits the object offset and a relocation
// to fp->bo. Presumably selected by `binning` -- confirm.
359 OUT_PKT0(ring
, REG_A3XX_SP_FS_LENGTH_REG
, 1);
360 OUT_RING(ring
, 0x00000000);
362 OUT_PKT0(ring
, REG_A3XX_SP_FS_CTRL_REG0
, 2);
363 OUT_RING(ring
, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI
) |
364 A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER
));
365 OUT_RING(ring
, 0x00000000);
367 OUT_PKT0(ring
, REG_A3XX_SP_FS_LENGTH_REG
, 1);
368 OUT_RING(ring
, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp
->instrlen
));
370 OUT_PKT0(ring
, REG_A3XX_SP_FS_CTRL_REG0
, 2);
371 OUT_RING(ring
, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI
) |
372 A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER
) |
373 A3XX_SP_FS_CTRL_REG0_CACHEINVALID
|
374 A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi
->max_half_reg
+ 1) |
375 A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi
->max_reg
+ 1) |
376 A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
377 A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS
) |
378 A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE
|
379 COND(fp
->samplers_count
> 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE
) |
380 A3XX_SP_FS_CTRL_REG0_LENGTH(fp
->instrlen
));
381 OUT_RING(ring
, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp
->constlen
) |
382 A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp
->total_in
) |
383 A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fsi
->max_const
, 0)) |
384 A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
385 OUT_PKT0(ring
, REG_A3XX_SP_FS_OBJ_OFFSET_REG
, 2);
386 OUT_RING(ring
, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
387 A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
388 OUT_RELOC(ring
, fp
->bo
, 0, 0, 0); /* SP_FS_OBJ_START_REG */
391 OUT_PKT0(ring
, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0
, 2);
392 OUT_RING(ring
, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */
393 OUT_RING(ring
, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_1 */
// Fragment output register: when the shader writes position, depth output
// is enabled with posz_regid; a zero write is the visible alternative.
395 OUT_PKT0(ring
, REG_A3XX_SP_FS_OUTPUT_REG
, 1);
396 if (fp
->writes_pos
) {
397 OUT_RING(ring
, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE
|
398 A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid
));
400 OUT_RING(ring
, 0x00000000);
// Render targets: only MRT 0 carries color_regid (plus HALF_PRECISION when
// fp->half_precision is set); MRT 1-3 are written with regid 0.
403 OUT_PKT0(ring
, REG_A3XX_SP_FS_MRT_REG(0), 4);
404 OUT_RING(ring
, A3XX_SP_FS_MRT_REG_REGID(color_regid
) |
405 COND(fp
->half_precision
, A3XX_SP_FS_MRT_REG_HALF_PRECISION
));
406 OUT_RING(ring
, A3XX_SP_FS_MRT_REG_REGID(0));
407 OUT_RING(ring
, A3XX_SP_FS_MRT_REG_REGID(0));
408 OUT_RING(ring
, A3XX_SP_FS_MRT_REG_REGID(0));
// VPC attributes. Again two alternatives are visible: one without
// TOTALATTR and with a zero second dword, and one that passes fp->total_in
// as the attribute and varying counts. Presumably selected by `binning`
// -- confirm.
411 OUT_PKT0(ring
, REG_A3XX_VPC_ATTR
, 2);
412 OUT_RING(ring
, A3XX_VPC_ATTR_THRDASSIGN(1) |
413 A3XX_VPC_ATTR_LMSIZE(1));
414 OUT_RING(ring
, 0x00000000);
416 OUT_PKT0(ring
, REG_A3XX_VPC_ATTR
, 2);
417 OUT_RING(ring
, A3XX_VPC_ATTR_TOTALATTR(fp
->total_in
) |
418 A3XX_VPC_ATTR_THRDASSIGN(1) |
419 A3XX_VPC_ATTR_LMSIZE(1));
420 OUT_RING(ring
, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp
->total_in
) |
421 A3XX_VPC_PACK_NUMNONPOSVSVAR(fp
->total_in
));
// Varying interpolation and point-sprite replacement modes are copied
// verbatim from the fragment shader state (four dwords each).
423 OUT_PKT0(ring
, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4);
424 OUT_RING(ring
, fp
->vinterp
[0]); /* VPC_VARYING_INTERP[0].MODE */
425 OUT_RING(ring
, fp
->vinterp
[1]); /* VPC_VARYING_INTERP[1].MODE */
426 OUT_RING(ring
, fp
->vinterp
[2]); /* VPC_VARYING_INTERP[2].MODE */
427 OUT_RING(ring
, fp
->vinterp
[3]); /* VPC_VARYING_INTERP[3].MODE */
429 OUT_PKT0(ring
, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4);
430 OUT_RING(ring
, fp
->vpsrepl
[0]); /* VPC_VARYING_PS_REPL[0].MODE */
431 OUT_RING(ring
, fp
->vpsrepl
[1]); /* VPC_VARYING_PS_REPL[1].MODE */
432 OUT_RING(ring
, fp
->vpsrepl
[2]); /* VPC_VARYING_PS_REPL[2].MODE */
433 OUT_RING(ring
, fp
->vpsrepl
[3]); /* VPC_VARYING_PS_REPL[3].MODE */
436 OUT_PKT0(ring
, REG_A3XX_VFD_VS_THREADING_THRESHOLD
, 1);
437 OUT_RING(ring
, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
438 A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));
// Upload the shader instructions. Each emit_shader() call is followed by a
// zero write to VFD_PERFCOUNTER0_SELECT.
// NOTE(review): any condition guarding the fragment upload is not visible
// in this listing.
440 emit_shader(ring
, vp
);
442 OUT_PKT0(ring
, REG_A3XX_VFD_PERFCOUNTER0_SELECT
, 1);
443 OUT_RING(ring
, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */
446 emit_shader(ring
, fp
);
448 OUT_PKT0(ring
, REG_A3XX_VFD_PERFCOUNTER0_SELECT
, 1);
449 OUT_RING(ring
, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */
// Vertex fetch control: attribute and instruction counts come from the
// vertex shader's total_in and inputs_count.
452 OUT_PKT0(ring
, REG_A3XX_VFD_CONTROL_0
, 2);
453 OUT_RING(ring
, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp
->total_in
) |
454 A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
455 A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(vp
->inputs_count
) |
456 A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(vp
->inputs_count
));
457 OUT_RING(ring
, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
458 A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
459 A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
462 /* hack.. until we figure out how to deal w/ vpsrepl properly.. */
// Overrides the point-sprite replacement words of the context's blit
// fragment shader (ctx->blit_prog.fp): all four vpsrepl entries are set to
// 0x99999999. fd3_program_emit() later writes these words to
// VPC_VARYING_PS_REPL_MODE.
// NOTE(review): the meaning of the 0x99999999 bit pattern is not
// established by this file -- check the register definition.
464 fix_blit_fp(struct pipe_context
*pctx
)
466 struct fd_context
*ctx
= fd_context(pctx
);
467 struct fd3_shader_stateobj
*so
= ctx
->blit_prog
.fp
;
469 so
->vpsrepl
[0] = 0x99999999;
470 so
->vpsrepl
[1] = 0x99999999;
471 so
->vpsrepl
[2] = 0x99999999;
472 so
->vpsrepl
[3] = 0x99999999;
// Installs the a3xx shader create/delete callbacks on the pipe context:
// fragment-shader hooks first, then vertex-shader hooks.
// NOTE(review): any statements following these assignments are not visible
// in this listing.
476 fd3_prog_init(struct pipe_context
*pctx
)
478 pctx
->create_fs_state
= fd3_fp_state_create
;
479 pctx
->delete_fs_state
= fd3_fp_state_delete
;
481 pctx
->create_vs_state
= fd3_vp_state_create
;
482 pctx
->delete_vs_state
= fd3_vp_state_delete
;