4 TGSI, Tungsten Graphics Shader Instructions, is an intermediate language
5 for describing shaders. Since Gallium is inherently shaderful, shaders are
6 an important part of the API. TGSI is the only intermediate representation
9 From GL_NV_vertex_program
10 -------------------------
13 ARL - Address Register Load
17 dst.x = \lfloor src.x\rfloor
19 dst.y = \lfloor src.y\rfloor
21 dst.z = \lfloor src.z\rfloor
23 dst.w = \lfloor src.w\rfloor
39 LIT - Light Coefficients
45 dst.y = max(src.x, 0.0)
47 dst.z = (src.x > 0.0) ? pow(max(src.y, 0.0), clamp(src.w, -128.0, 128.0)) : 0.0
65 RSQ - Reciprocal Square Root
69 dst.x = 1.0 / sqrt(abs(src.x))
71 dst.y = 1.0 / sqrt(abs(src.x))
73 dst.z = 1.0 / sqrt(abs(src.x))
75 dst.w = 1.0 / sqrt(abs(src.x))
78 EXP - Approximate Exponential Base 2
82 dst.x = pow(2.0, \lfloor src.x\rfloor)
84 dst.y = src.x - \lfloor src.x\rfloor
86 dst.z = pow(2.0, src.x)
91 LOG - Approximate Logarithm Base 2
95 dst.x = \lfloor lg2(abs(src.x))\rfloor
97 dst.y = abs(src.x) / pow(2.0, \lfloor lg2(abs(src.x))\rfloor )
99 dst.z = lg2(abs(src.x))
108 dst.x = src0.x * src1.x
110 dst.y = src0.y * src1.y
112 dst.z = src0.z * src1.z
114 dst.w = src0.w * src1.w
121 dst.x = src0.x + src1.x
123 dst.y = src0.y + src1.y
125 dst.z = src0.z + src1.z
127 dst.w = src0.w + src1.w
130 DP3 - 3-component Dot Product
134 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
136 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
138 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
140 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
143 DP4 - 4-component Dot Product
147 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
149 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
151 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
153 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
156 DST - Distance Vector
162 dst.y = src0.y * src1.y
173 dst.x = min(src0.x, src1.x)
175 dst.y = min(src0.y, src1.y)
177 dst.z = min(src0.z, src1.z)
179 dst.w = min(src0.w, src1.w)
186 dst.x = max(src0.x, src1.x)
188 dst.y = max(src0.y, src1.y)
190 dst.z = max(src0.z, src1.z)
192 dst.w = max(src0.w, src1.w)
195 SLT - Set On Less Than
199 dst.x = (src0.x < src1.x) ? 1.0 : 0.0
201 dst.y = (src0.y < src1.y) ? 1.0 : 0.0
203 dst.z = (src0.z < src1.z) ? 1.0 : 0.0
205 dst.w = (src0.w < src1.w) ? 1.0 : 0.0
208 SGE - Set On Greater Than Or Equal
212 dst.x = (src0.x >= src1.x) ? 1.0 : 0.0
214 dst.y = (src0.y >= src1.y) ? 1.0 : 0.0
216 dst.z = (src0.z >= src1.z) ? 1.0 : 0.0
218 dst.w = (src0.w >= src1.w) ? 1.0 : 0.0
221 MAD - Multiply And Add
225 dst.x = src0.x * src1.x + src2.x
227 dst.y = src0.y * src1.y + src2.y
229 dst.z = src0.z * src1.z + src2.z
231 dst.w = src0.w * src1.w + src2.w
238 dst.x = src0.x - src1.x
240 dst.y = src0.y - src1.y
242 dst.z = src0.z - src1.z
244 dst.w = src0.w - src1.w
247 LRP - Linear Interpolate
251 dst.x = src0.x * (src1.x - src2.x) + src2.x
253 dst.y = src0.y * (src1.y - src2.y) + src2.y
255 dst.z = src0.z * (src1.z - src2.z) + src2.z
257 dst.w = src0.w * (src1.w - src2.w) + src2.w
264 dst.x = (src2.x > 0.5) ? src0.x : src1.x
266 dst.y = (src2.y > 0.5) ? src0.y : src1.y
268 dst.z = (src2.z > 0.5) ? src0.z : src1.z
270 dst.w = (src2.w > 0.5) ? src0.w : src1.w
273 DP2A - 2-component Dot Product And Add
277 dst.x = src0.x * src1.x + src0.y * src1.y + src2.x
279 dst.y = src0.x * src1.x + src0.y * src1.y + src2.x
281 dst.z = src0.x * src1.x + src0.y * src1.y + src2.x
283 dst.w = src0.x * src1.x + src0.y * src1.y + src2.x
290 dst.x = src.x - \lfloor src.x\rfloor
292 dst.y = src.y - \lfloor src.y\rfloor
294 dst.z = src.z - \lfloor src.z\rfloor
296 dst.w = src.w - \lfloor src.w\rfloor
303 dst.x = clamp(src0.x, src1.x, src2.x)
304 dst.y = clamp(src0.y, src1.y, src2.y)
305 dst.z = clamp(src0.z, src1.z, src2.z)
306 dst.w = clamp(src0.w, src1.w, src2.w)
311 This is identical to ARL.
315 dst.x = \lfloor src.x\rfloor
317 dst.y = \lfloor src.y\rfloor
319 dst.z = \lfloor src.z\rfloor
321 dst.w = \lfloor src.w\rfloor
334 1.3.10 EX2 - Exponential Base 2
338 dst.x = pow(2.0, src.x)
339 dst.y = pow(2.0, src.x)
340 dst.z = pow(2.0, src.x)
341 dst.w = pow(2.0, src.x)
344 1.3.11 LG2 - Logarithm Base 2
358 dst.x = pow(src0.x, src1.x)
359 dst.y = pow(src0.x, src1.x)
360 dst.z = pow(src0.x, src1.x)
361 dst.w = pow(src0.x, src1.x)
363 1.3.15 XPD - Cross Product
367 dst.x = src0.y * src1.z - src1.y * src0.z
368 dst.y = src0.z * src1.x - src1.z * src0.x
369 dst.z = src0.x * src1.y - src1.x * src0.y
383 1.4.2 RCC - Reciprocal Clamped
387 dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
388 dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
389 dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
390 dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
393 1.4.3 DPH - Homogeneous Dot Product
397 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
398 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
399 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
400 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
416 1.5.2 DDX - Derivative Relative To X
420 dst.x = partialx(src.x)
421 dst.y = partialx(src.y)
422 dst.z = partialx(src.z)
423 dst.w = partialx(src.w)
426 1.5.3 DDY - Derivative Relative To Y
430 dst.x = partialy(src.x)
431 dst.y = partialy(src.y)
432 dst.z = partialy(src.z)
433 dst.w = partialy(src.w)
436 1.5.7 KILP - Predicated Discard
443 1.5.10 PK2H - Pack Two 16-bit Floats
448 1.5.11 PK2US - Pack Two Unsigned 16-bit Scalars
453 1.5.12 PK4B - Pack Four Signed 8-bit Scalars
458 1.5.13 PK4UB - Pack Four Unsigned 8-bit Scalars
463 1.5.15 RFL - Reflection Vector
467 dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x
468 dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y
469 dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z
472 Considered for removal.
475 1.5.16 SEQ - Set On Equal
479 dst.x = (src0.x == src1.x) ? 1.0 : 0.0
480 dst.y = (src0.y == src1.y) ? 1.0 : 0.0
481 dst.z = (src0.z == src1.z) ? 1.0 : 0.0
482 dst.w = (src0.w == src1.w) ? 1.0 : 0.0
485 1.5.17 SFL - Set On False
494 Considered for removal.
496 1.5.18 SGT - Set On Greater Than
500 dst.x = (src0.x > src1.x) ? 1.0 : 0.0
501 dst.y = (src0.y > src1.y) ? 1.0 : 0.0
502 dst.z = (src0.z > src1.z) ? 1.0 : 0.0
503 dst.w = (src0.w > src1.w) ? 1.0 : 0.0
519 1.5.20 SLE - Set On Less Than Or Equal
523 dst.x = (src0.x <= src1.x) ? 1.0 : 0.0
524 dst.y = (src0.y <= src1.y) ? 1.0 : 0.0
525 dst.z = (src0.z <= src1.z) ? 1.0 : 0.0
526 dst.w = (src0.w <= src1.w) ? 1.0 : 0.0
529 1.5.21 SNE - Set On Not Equal
533 dst.x = (src0.x != src1.x) ? 1.0 : 0.0
534 dst.y = (src0.y != src1.y) ? 1.0 : 0.0
535 dst.z = (src0.z != src1.z) ? 1.0 : 0.0
536 dst.w = (src0.w != src1.w) ? 1.0 : 0.0
539 1.5.22 STR - Set On True
549 1.5.23 TEX - Texture Lookup
554 1.5.24 TXD - Texture Lookup with Derivatives
559 1.5.25 TXP - Projective Texture Lookup
564 1.5.26 UP2H - Unpack Two 16-Bit Floats
568 Considered for removal.
570 1.5.27 UP2US - Unpack Two Unsigned 16-Bit Scalars
574 Considered for removal.
576 1.5.28 UP4B - Unpack Four Signed 8-Bit Values
580 Considered for removal.
582 1.5.29 UP4UB - Unpack Four Unsigned 8-Bit Scalars
586 Considered for removal.
588 1.5.30 X2D - 2D Coordinate Transformation
592 dst.x = src0.x + src1.x * src2.x + src1.y * src2.y
593 dst.y = src0.y + src1.x * src2.z + src1.y * src2.w
594 dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
595 dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
597 Considered for removal.
600 1.6 GL_NV_vertex_program2
601 --------------------------
604 1.6.1 ARA - Address Register Add
608 Considered for removal.
610 1.6.2 ARR - Address Register Load With Round
624 Considered for removal.
626 1.6.4 CAL - Subroutine Call
632 1.6.5 RET - Subroutine Call Return
636 Potential restrictions:
637 * Only occurs at end of function.
643 dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0
644 dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0
645 dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0
646 dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0
653 dst.x = (src0.x < 0.0) ? src1.x : src2.x
654 dst.y = (src0.y < 0.0) ? src1.y : src2.y
655 dst.z = (src0.z < 0.0) ? src1.z : src2.z
656 dst.w = (src0.w < 0.0) ? src1.w : src2.w
659 1.8.2 KIL - Conditional Discard
663 if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0)
681 1.8.4 TXB - Texture Lookup With Bias
686 1.9.1 NRM - 3-component Vector Normalise
690 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
691 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
692 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
700 dst.x = src0.x / src1.x
701 dst.y = src0.y / src1.y
702 dst.z = src0.z / src1.z
703 dst.w = src0.w / src1.w
706 1.9.3 DP2 - 2-component Dot Product
710 dst.x = src0.x * src1.x + src0.y * src1.y
711 dst.y = src0.x * src1.x + src0.y * src1.y
712 dst.z = src0.x * src1.x + src0.y * src1.y
713 dst.w = src0.x * src1.x + src0.y * src1.y
716 1.9.5 TXL - Texture Lookup With LOD
731 1.9.8 BGNFOR - Begin a For-Loop
738 pc = [matching ENDFOR] + 1
741 Note: The destination must be a loop register.
742 The source must be a constant register.
744 Considered for cleanup / removal.
757 1.9.11 ENDIF - End If
762 1.9.12 ENDFOR - End a For-Loop
764 dst.x = dst.x + dst.z
768 pc = [matching BGNFOR instruction] + 1
771 Note: The destination must be a loop register.
773 Considered for cleanup / removal.
775 1.9.13 ENDREP - End Repeat
780 1.10.1 PUSHA - Push Address Register On Stack
787 Considered for cleanup / removal.
789 1.10.2 POPA - Pop Address Register From Stack
796 Considered for cleanup / removal.
799 1.11 GL_NV_gpu_program4
800 ------------------------
802 Support for these opcodes indicated by a special pipe capability bit (TBD).
804 1.11.1 CEIL - Ceiling
814 1.11.2 I2F - Integer To Float
818 dst.x = (float) src.x
819 dst.y = (float) src.y
820 dst.z = (float) src.z
821 dst.w = (float) src.w
824 1.11.3 NOT - Bitwise Not
834 1.11.4 TRUNC - Truncate
844 1.11.5 SHL - Shift Left
848 dst.x = src0.x << src1.x
849 dst.y = src0.y << src1.x
850 dst.z = src0.z << src1.x
851 dst.w = src0.w << src1.x
854 1.11.6 SHR - Shift Right
858 dst.x = src0.x >> src1.x
859 dst.y = src0.y >> src1.x
860 dst.z = src0.z >> src1.x
861 dst.w = src0.w >> src1.x
864 1.11.7 AND - Bitwise And
868 dst.x = src0.x & src1.x
869 dst.y = src0.y & src1.y
870 dst.z = src0.z & src1.z
871 dst.w = src0.w & src1.w
874 1.11.8 OR - Bitwise Or
878 dst.x = src0.x | src1.x
879 dst.y = src0.y | src1.y
880 dst.z = src0.z | src1.z
881 dst.w = src0.w | src1.w
888 dst.x = src0.x % src1.x
889 dst.y = src0.y % src1.y
890 dst.z = src0.z % src1.z
891 dst.w = src0.w % src1.w
894 1.11.10 XOR - Bitwise Xor
898 dst.x = src0.x ^ src1.x
899 dst.y = src0.y ^ src1.y
900 dst.z = src0.z ^ src1.z
901 dst.w = src0.w ^ src1.w
904 1.11.11 SAD - Sum Of Absolute Differences
908 dst.x = abs(src0.x - src1.x) + src2.x
909 dst.y = abs(src0.y - src1.y) + src2.y
910 dst.z = abs(src0.z - src1.z) + src2.z
911 dst.w = abs(src0.w - src1.w) + src2.w
914 1.11.12 TXF - Texel Fetch
919 1.11.13 TXQ - Texture Size Query
924 1.11.14 CONT - Continue
929 1.12 GL_NV_geometry_program4
930 -----------------------------
938 1.12.2 ENDPRIM - End Primitive
947 1.13.1 BGNLOOP - Begin a Loop
952 1.13.2 BGNSUB - Begin Subroutine
957 1.13.3 ENDLOOP - End a Loop
962 1.13.4 ENDSUB - End Subroutine
968 1.13.10 NOP - No Operation
974 1.16.7 NRM4 - 4-component Vector Normalise
978 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
979 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
980 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
981 dst.w = src.w / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
988 1.17.2 CALLNZ - Subroutine Call If Not Zero
998 1.17.5 BREAKC - Break Conditional
1003 2 Explanation of symbols used
1004 ==============================
1011 abs(x) Absolute value of x.
1014 ceil(x) Ceiling of x.
1016 clamp(x,y,z) Clamp x between y and z.
1017 (x < y) ? y : (x > z) ? z : x
1019 :math:`\lfloor x\rfloor` Floor of x.
1021 lg2(x) Logarithm base 2 of x.
1023 max(x,y) Maximum of x and y.
1026 min(x,y) Minimum of x and y.
1029 partialx(x) Derivative of x relative to fragment's X.
1031 partialy(x) Derivative of x relative to fragment's Y.
1033 pop() Pop from stack.
1035 pow(x,y) Raise x to power of y.
1037 push(x) Push x on stack.
1041 sqrt(x) Square root of x.
1043 trunc(x) Truncate x.
1050 discard Discard fragment.
1052 dst First destination register.
1054 dst0 First destination register.
1058 src First source register.
1060 src0 First source register.
1062 src1 Second source register.
1064 src2 Third source register.
1066 target Label of target instruction.
1073 3.1 Declaration Semantic
1074 -------------------------
1077 Follows Declaration token if Semantic bit is set.
1079 Since its purpose is to link a shader with other stages of the pipeline,
1080 it is valid to follow only those Declaration tokens that declare a register
1081 either in INPUT or OUTPUT file.
1083 SemanticName field contains the semantic name of the register being declared.
1084 There is no default value.
1086 SemanticIndex is an optional subscript that can be used to distinguish
1087 different register declarations with the same semantic name. The default value
1090 The meanings of the individual semantic names are explained in the following
1096 Valid only in a fragment shader INPUT declaration.
1098 FACE.x is negative when the primitive is back facing. FACE.x is positive
1099 when the primitive is front facing.