2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 #include "nv50_ir_target_nvc0.h"
// Factory entry point: allocates a TargetNVC0 for `chipset` with `new` and
// returns it as a Target pointer.
// NOTE(review): no owner is named here, so presumably the caller is
// responsible for deleting the object — confirm against the call sites.
27 Target
*getTargetNVC0(unsigned int chipset
)
29 return new TargetNVC0(chipset
);
32 TargetNVC0::TargetNVC0(unsigned int card
)
38 // BUILTINS / LIBRARY FUNCTIONS:
40 // laziness -> will just hardcode everything for the time being
42 // Will probably make this nicer once we support subroutines properly,
43 // i.e. when we have an input IR that provides function declarations.
// Hard-coded code words for the built-in library routines (U32/S32 integer
// division, F64 reciprocal and F64 reciprocal square root). The table is an
// array of 32-bit words written two per row; the SIZE notes of the individual
// routines count 8 bytes per entry. getBuiltinCode() exposes the whole array.
45 static const uint32_t nvc0_builtin_code
[] =
47 // DIV U32: slow unsigned integer division
49 // UNR recurrence (q = a / b):
50 // look for z such that 2^32 - b <= b * z < 2^32
51 // then q - 1 <= (a * z) / 2^32 <= q
53 // INPUT: $r0: dividend, $r1: divisor
54 // OUTPUT: $r0: result, $r1: modulus
55 // CLOBBER: $r2 - $r3, $p0 - $p1
56 // SIZE: 22 / 14 * 8 bytes
59 0x04009c03, 0x78000000,
62 0x08309c03, 0x60000000,
65 0x0c209c43, 0x20040000,
66 0x0810dc03, 0x50000000,
67 0x0c209c43, 0x20040000,
68 0x0810dc03, 0x50000000,
69 0x0c209c43, 0x20040000,
70 0x0810dc03, 0x50000000,
71 0x0c209c43, 0x20040000,
72 0x0810dc03, 0x50000000,
73 0x0c209c43, 0x20040000,
74 0x0000dde4, 0x28000000,
75 0x08001c43, 0x50000000,
78 0x0811dc03, 0x1b0e0000,
79 0x08104103, 0x48000000,
80 0x04000002, 0x08000000,
81 0x0811c003, 0x1b0e0000,
82 0x08104103, 0x48000000,
86 0x0401dc03, 0x1b0e0000,
87 0x00008003, 0x78000000,
88 0x0400c003, 0x78000000,
89 0x0c20c103, 0x48000000,
90 0x0c108003, 0x60000000,
93 0x0031c023, 0x1b0ec000,
94 0xb000a1e7, 0x40000000,
95 0x04000003, 0x6000c000,
96 0x0813dc03, 0x1b000000,
99 0x04208003, 0x5800c000,
100 0x0430c103, 0x4800c000,
105 // DIV S32: slow signed integer division
107 // INPUT: $r0: dividend, $r1: divisor
108 // OUTPUT: $r0: result, $r1: modulus
109 // CLOBBER: $r2 - $r3, $p0 - $p3
110 // SIZE: 18 * 8 bytes
112 0xfc05dc23, 0x188e0000,
113 0xfc17dc23, 0x18c40000,
116 0x0401dc03, 0x1b0e0000,
117 0x00008003, 0x78000000,
118 0x0400c003, 0x78000000,
119 0x0c20c103, 0x48000000,
120 0x0c108003, 0x60000000,
123 0x0031c023, 0x1b0ec000,
124 0xb000a1e7, 0x40000000,
125 0x04000003, 0x6000c000,
126 0x0813dc03, 0x1b000000,
129 0x04208003, 0x5800c000,
130 0x0430c103, 0x4800c000,
136 // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
139 // OUTPUT: $r0d (rcp(x))
140 // CLOBBER: $r2 - $r7
147 0x08011e01, 0x200c0000,
148 0x10209c01, 0x50000000,
149 0x08011e01, 0x200c0000,
150 0x10209c01, 0x50000000,
151 0x08011e01, 0x200c0000,
152 0x10201c01, 0x50000000,
153 0x00001de7, 0x90000000,
155 // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
158 // OUTPUT: $r0d (rsqrt(x))
159 // CLOBBER: $r2 - $r7
160 // SIZE: 14 * 8 bytes
166 0x18001c01, 0x50000000,
167 0x0001dde2, 0x18ffe000,
168 0x08211c01, 0x50000000,
169 0x10011e01, 0x200c0000,
170 0x10209c01, 0x50000000,
171 0x08211c01, 0x50000000,
172 0x10011e01, 0x200c0000,
173 0x10209c01, 0x50000000,
174 0x08211c01, 0x50000000,
175 0x10011e01, 0x200c0000,
176 0x10201c01, 0x50000000,
177 0x00001de7, 0x90000000,
180 static const uint16_t nvc0_builtin_offsets
[NVC0_BUILTIN_COUNT
] =
// Exposes the built-in routine table to the caller.
//   code: out — receives the address of the first word of nvc0_builtin_code.
//   size: out — receives sizeof(nvc0_builtin_code), i.e. the size in bytes.
// Both pointers are dereferenced unconditionally, so neither may be null.
189 TargetNVC0::getBuiltinCode(const uint32_t **code
, uint32_t *size
) const
191 *code
= &nvc0_builtin_code
[0];
192 *size
= sizeof(nvc0_builtin_code
);
// Returns the nvc0_builtin_offsets entry for the built-in routine `builtin`.
// The assert only states the upper bound (builtin < NVC0_BUILTIN_COUNT).
// NOTE(review): `builtin` is a signed int and a negative value is not
// rejected explicitly — confirm that callers only pass valid builtin ids.
196 TargetNVC0::getBuiltinOffset(int builtin
) const
198 assert(builtin
< NVC0_BUILTIN_COUNT
);
199 return nvc0_builtin_offsets
[builtin
];
// Capability bit masks of one operation, consumed by initOpInfo().
// For mNeg / mAbs / mNot / fConst / fImmd, bit s (s = 0..2) refers to source
// operand s: the m* masks say which source modifier is allowed, the f* masks
// which extra source file (constant buffer, immediate) is accepted.
// NOTE(review): the tables set mSat only to 0x0 or 0x8, so it presumably acts
// as a single "saturate allowed" flag — confirm.
205 unsigned int mNeg
: 4;
206 unsigned int mAbs
: 4;
207 unsigned int mNot
: 4;
208 unsigned int mSat
: 4;
209 unsigned int fConst
: 3;
210 unsigned int fImmd
: 4; // last bit indicates if full immediate is supported
// Per-operation capability table read by initOpInfo(). Each row is
// { op, mNeg, mAbs, mNot, mSat, fConst, fImmd }, in the column order of the
// legend below; in the per-source masks bit s stands for source operand s.
// Operations without a row keep the defaults that initOpInfo() assigns.
213 static const struct opProperties _initProps
[] =
215 // neg abs not sat c[] imm
216 { OP_ADD
, 0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
217 { OP_SUB
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
218 { OP_MUL
, 0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
219 { OP_MAX
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
220 { OP_MIN
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
221 { OP_MAD
, 0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
222 { OP_ABS
, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
223 { OP_NEG
, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
224 { OP_CVT
, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
225 { OP_AND
, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
226 { OP_OR
, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
227 { OP_XOR
, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
228 { OP_SHL
, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
229 { OP_SHR
, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
230 { OP_SET
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
231 { OP_SLCT
, 0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
232 { OP_PREEX2
, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
233 { OP_PRESIN
, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
234 { OP_COS
, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
235 { OP_SIN
, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
236 { OP_EX2
, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
237 { OP_LG2
, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
238 { OP_RCP
, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
239 { OP_RSQ
, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
240 { OP_DFDX
, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
241 { OP_DFDY
, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
242 { OP_CALL
, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
243 { OP_INSBF
, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
244 { OP_SET_AND
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
245 { OP_SET_OR
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
246 { OP_SET_XOR
, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
248 { OP_LINTERP
, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
249 { OP_PINTERP
, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
// Populates the target description tables.
//  1. nativeFileMap: every data file maps to itself, except FILE_ADDRESS,
//     which is mapped to FILE_GPR.
//  2. opInfo defaults for every operation below OP_LAST: F32 source and
//     destination types, GPR-only operands, no modifiers, hasDest = 1, plus
//     the flags derived from the bit sets and opcode ranges below.
//  3. hasDest is cleared for every operation listed in noDest.
//  4. The rows of _initProps add the allowed source modifiers / source files
//     and set immdBits and dstMods for the operations they name.
252 void TargetNVC0::initOpInfo()
// One bit per operation: bit (op % 32) of word (op / 32).
256 static const uint32_t commutative
[(OP_LAST
+ 31) / 32] =
258 // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
259 0x0670ca00, 0x0000003f, 0x00000000
// Same bit layout as `commutative`; a set bit selects minEncSize 4 instead of 8.
// NOTE(review): the first word is identical to the commutative mask's first
// word although the operation list below differs — confirm the value.
262 static const uint32_t shortForm
[(OP_LAST
+ 31) / 32] =
264 // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
265 0x0670ca00, 0x00000000, 0x00000000
// Operations whose hasDest flag is reset to 0 further down.
268 static const operation noDest
[] =
270 OP_STORE
, OP_WRSV
, OP_EXPORT
, OP_BRA
, OP_CALL
, OP_RET
, OP_EXIT
,
271 OP_DISCARD
, OP_CONT
, OP_BREAK
, OP_PRECONT
, OP_PREBREAK
, OP_PRERET
,
272 OP_JOIN
, OP_JOINAT
, OP_BRKPT
, OP_MEMBAR
, OP_EMIT
, OP_RESTART
,
273 OP_QUADON
, OP_QUADPOP
276 joinAnterior
= false;
// Identity mapping first; the FILE_ADDRESS entry is then redirected to FILE_GPR.
278 for (i
= 0; i
< DATA_FILE_COUNT
; ++i
)
279 nativeFileMap
[i
] = (DataFile
)i
;
280 nativeFileMap
[FILE_ADDRESS
] = FILE_GPR
;
// Defaults for every operation; the index i doubles as the opcode value.
282 for (i
= 0; i
< OP_LAST
; ++i
) {
283 opInfo
[i
].variants
= NULL
;
284 opInfo
[i
].op
= (operation
)i
;
285 opInfo
[i
].srcTypes
= 1 << (int)TYPE_F32
;
286 opInfo
[i
].dstTypes
= 1 << (int)TYPE_F32
;
287 opInfo
[i
].immdBits
= 0;
288 opInfo
[i
].srcNr
= operationSrcNr
[i
];
// Only the first srcNr source slots are initialised here.
290 for (j
= 0; j
< opInfo
[i
].srcNr
; ++j
) {
291 opInfo
[i
].srcMods
[j
] = 0;
292 opInfo
[i
].srcFiles
[j
] = 1 << (int)FILE_GPR
;
294 opInfo
[i
].dstMods
= 0;
295 opInfo
[i
].dstFiles
= 1 << (int)FILE_GPR
;
297 opInfo
[i
].hasDest
= 1;
// The flags below are derived from opcode ranges and the two bit sets above.
298 opInfo
[i
].vector
= (i
>= OP_TEX
&& i
<= OP_TEXCSAA
);
299 opInfo
[i
].commutative
= (commutative
[i
/ 32] >> (i
% 32)) & 1;
300 opInfo
[i
].pseudo
= (i
< OP_MOV
);
301 opInfo
[i
].predicate
= !opInfo
[i
].pseudo
;
302 opInfo
[i
].flow
= (i
>= OP_BRA
&& i
<= OP_JOIN
);
303 opInfo
[i
].minEncSize
= (shortForm
[i
/ 32] & (1 << (i
% 32))) ? 4 : 8;
// Clear hasDest for the operations listed in noDest.
305 for (i
= 0; i
< sizeof(noDest
) / sizeof(noDest
[0]); ++i
)
306 opInfo
[noDest
[i
]].hasDest
= 0;
// Apply the per-operation capability rows from _initProps.
308 for (i
= 0; i
< sizeof(_initProps
) / sizeof(_initProps
[0]); ++i
) {
309 const struct opProperties
*prop
= &_initProps
[i
];
// Bit s of each mask enables the feature for source operand s (s = 0..2).
311 for (int s
= 0; s
< 3; ++s
) {
312 if (prop
->mNeg
& (1 << s
))
313 opInfo
[prop
->op
].srcMods
[s
] |= NV50_IR_MOD_NEG
;
314 if (prop
->mAbs
& (1 << s
))
315 opInfo
[prop
->op
].srcMods
[s
] |= NV50_IR_MOD_ABS
;
316 if (prop
->mNot
& (1 << s
))
317 opInfo
[prop
->op
].srcMods
[s
] |= NV50_IR_MOD_NOT
;
318 if (prop
->fConst
& (1 << s
))
319 opInfo
[prop
->op
].srcFiles
[s
] |= 1 << (int)FILE_MEMORY_CONST
;
320 if (prop
->fImmd
& (1 << s
))
321 opInfo
[prop
->op
].srcFiles
[s
] |= 1 << (int)FILE_IMMEDIATE
;
// immdBits == 0xffffffff is what insnCanLoad() tests for full 32 bit immediates.
323 opInfo
[prop
->op
].immdBits
= 0xffffffff;
326 opInfo
[prop
->op
].dstMods
= NV50_IR_MOD_SAT
;
// Returns the capacity of the given register or memory file.
// NOTE(review): the unit differs per file — presumably register counts for
// GPR / PREDICATE / FLAGS and address ranges for the memory files; confirm
// together with getFileUnit().
// A file without a case of its own ends up at the "invalid file" assert.
331 TargetNVC0::getFileSize(DataFile file
) const
334 case FILE_NULL
: return 0;
335 case FILE_GPR
: return 63;
336 case FILE_PREDICATE
: return 7;
337 case FILE_FLAGS
: return 1;
338 case FILE_ADDRESS
: return 0;
339 case FILE_IMMEDIATE
: return 0;
340 case FILE_MEMORY_CONST
: return 65536;
341 case FILE_SHADER_INPUT
: return 0x400;
342 case FILE_SHADER_OUTPUT
: return 0x400;
343 case FILE_MEMORY_GLOBAL
: return 0xffffffff;
344 case FILE_MEMORY_SHARED
: return 16 << 10;
345 case FILE_MEMORY_LOCAL
: return 48 << 10;
346 case FILE_SYSTEM_VALUE
: return 32;
348 assert(!"invalid file");
// Returns the addressing unit of a data file. FILE_GPR, FILE_ADDRESS and
// FILE_SYSTEM_VALUE are handled as one group, all other files as another.
// NOTE(review): presumably the result is a log2 size — confirm with the users
// of this value.
354 TargetNVC0::getFileUnit(DataFile file
) const
356 if (file
== FILE_GPR
|| file
== FILE_ADDRESS
|| file
== FILE_SYSTEM_VALUE
)
// Maps the system value described by `sym` to its fixed address.
//   shaderFile: only compared against FILE_SHADER_INPUT; SV_PRIMITIVE_ID is
//               the one value whose address depends on it (0x060 vs 0x040).
//   sym:        supplies the semantic (reg.data.sv.sv) and the component
//               index (reg.data.sv.index).
// Multi-component values place component idx at base + idx * 4.
362 TargetNVC0::getSVAddress(DataFile shaderFile
, const Symbol
*sym
) const
364 const int idx
= sym
->reg
.data
.sv
.index
;
365 const SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
367 const bool isInput
= shaderFile
== FILE_SHADER_INPUT
;
370 case SV_POSITION
: return 0x070 + idx
* 4;
371 case SV_INSTANCE_ID
: return 0x2f8;
372 case SV_VERTEX_ID
: return 0x2fc;
373 case SV_PRIMITIVE_ID
: return isInput
? 0x060 : 0x040;
374 case SV_LAYER
: return 0x064;
375 case SV_VIEWPORT_INDEX
: return 0x068;
376 case SV_POINT_SIZE
: return 0x06c;
377 case SV_CLIP_DISTANCE
: return 0x2c0 + idx
* 4;
378 case SV_POINT_COORD
: return 0x2e0 + idx
* 4;
379 case SV_FACE
: return 0x3fc;
380 case SV_TESS_FACTOR
: return 0x000 + idx
* 4;
381 case SV_TESS_COORD
: return 0x2f0 + idx
* 4;
// Decides whether instruction `i` may take the source of the load `ld`
// (ld->src[0]) directly as its source operand `s`.
//   i:  the consuming instruction.
//   s:  index of the source slot of `i` that would receive the operand.
//   ld: the load whose source is being considered.
// Checks, in order:
//   - an immediate 0 is accepted for everything except texture ops,
//     OP_EXPORT and OP_STORE;
//   - `s` must be a valid source slot and opInfo[i->op].srcFiles[s] must
//     allow the file of the loaded value;
//   - indirect loads are excluded;
//   - the other sources of `i` are inspected for non-zero immediates and for
//     files other than GPR / PREDICATE;
//   - immediates must fit the encoding: unless the op accepts full 32 bit
//     immediates, F32 values need their low 12 bits clear and S32/U32 values
//     must lie in [-0x80000, 0x7ffff]; for MAD/FMA the low 12 bits are tested.
388 TargetNVC0::insnCanLoad(const Instruction
*i
, int s
,
389 const Instruction
*ld
) const
391 DataFile sf
= ld
->src
[0].getFile();
393 // immediate 0 can be represented by GPR $r63
394 if (sf
== FILE_IMMEDIATE
&& ld
->getSrc(0)->reg
.data
.u64
== 0)
395 return (!i
->asTex() && i
->op
!= OP_EXPORT
&& i
->op
!= OP_STORE
);
// valid source slots are 0 .. srcNr - 1, so s == srcNr is already out of range
397 if (s
>= opInfo
[i
->op
].srcNr
)
399 if (!(opInfo
[i
->op
].srcFiles
[s
] & (1 << (int)sf
)))
402 // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
403 if (ld
->src
[0].isIndirect(0))
406 for (int k
= 0; i
->srcExists(k
); ++k
) {
407 if (i
->src
[k
].getFile() == FILE_IMMEDIATE
) {
408 if (i
->getSrc(k
)->reg
.data
.u64
!= 0)
411 if (i
->src
[k
].getFile() != FILE_GPR
&&
412 i
->src
[k
].getFile() != FILE_PREDICATE
) {
417 // not all instructions support full 32 bit immediates
418 if (sf
== FILE_IMMEDIATE
) {
// bind the immediate's storage by reference; it is only read below
419 Storage
&reg
= ld
->getSrc(0)->asImm()->reg
;
421 if (opInfo
[i
->op
].immdBits
!= 0xffffffff) {
422 if (i
->sType
== TYPE_F32
) {
423 if (reg
.data
.u32
& 0xfff)
426 if (i
->sType
== TYPE_S32
|| i
->sType
== TYPE_U32
) {
427 // with u32, 0xfffff counts as 0xffffffff as well
428 if (reg
.data
.s32
> 0x7ffff || reg
.data
.s32
< -0x80000)
432 if (i
->op
== OP_MAD
|| i
->op
== OP_FMA
) {
433 // requires src == dst, cannot decide before RA
434 // (except if we implement more constraints)
435 if (ld
->getSrc(0)->asImm()->reg
.data
.u32
& 0xfff)
// Reports whether operation `op` is available for data type `ty`.
// Three groups are singled out: MAD/FMA with a type other than F32, SAD with
// a type other than S32, and POW/SQRT/DIV/MOD regardless of type.
// NOTE(review): presumably each of these groups is reported as unsupported
// and everything else as supported — confirm.
444 TargetNVC0::isOpSupported(operation op
, DataType ty
) const
446 if ((op
== OP_MAD
|| op
== OP_FMA
) && (ty
!= TYPE_F32
))
448 if (op
== OP_SAD
&& ty
!= TYPE_S32
)
450 if (op
== OP_POW
|| op
== OP_SQRT
|| op
== OP_DIV
|| op
== OP_MOD
)
// Checks whether source `s` of `insn` may carry the modifier `mod`.
// Instructions whose dType is not a float type take a special path that
// inspects the neg() modifier already set on the other source (src[s ? 0 : 1])
// or on src[1]. The general rule at the end requires every bit of `mod` to be
// contained in opInfo[insn->op].srcMods[s].
456 TargetNVC0::isModSupported(const Instruction
*insn
, int s
, Modifier mod
) const
458 if (!isFloatType(insn
->dType
)) {
471 if (insn
->src
[s
? 0 : 1].mod
.neg())
476 return insn
->src
[1].mod
.neg() ? false : true;
484 return (mod
& Modifier(opInfo
[insn
->op
].srcMods
[s
])) == mod
;
// Tells whether `insn` may be predicated. An instruction that already has a
// predicate (getPredicate()) is handled first; otherwise the answer is the
// per-operation opInfo[insn->op].predicate flag. `pred` is not used on the
// lines below.
// NOTE(review): presumably an existing predicate means "no" — confirm.
488 TargetNVC0::mayPredicate(const Instruction
*insn
, const Value
*pred
) const
490 if (insn
->getPredicate())
492 return opInfo
[insn
->op
].predicate
;
// Tells whether `insn` can saturate its result.
// OP_CVT is special-cased first. Other operations need NV50_IR_MOD_SAT in
// opInfo[insn->op].dstMods; beyond that, a U32 destination is accepted only
// for OP_ADD and OP_MAD, and any other destination type only if it is F32.
496 TargetNVC0::isSatSupported(const Instruction
*insn
) const
498 if (insn
->op
== OP_CVT
)
500 if (!(opInfo
[insn
->op
].dstMods
& NV50_IR_MOD_SAT
))
503 if (insn
->dType
== TYPE_U32
)
504 return (insn
->op
== OP_ADD
) || (insn
->op
== OP_MAD
);
506 return insn
->dType
== TYPE_F32
;
509 // TODO: better values
// Returns a latency estimate for instruction `i`. OP_LOAD is treated
// separately from other operations, and loads with cache mode CACHE_CV are
// distinguished from other loads.
// NOTE(review): presumably the value is in cycles — confirm against the
// scheduler that consumes it.
510 int TargetNVC0::getLatency(const Instruction
*i
) const
512 if (i
->op
== OP_LOAD
) {
513 if (i
->cache
== CACHE_CV
)
520 // These are "inverse" throughput values, i.e. the number of cycles required
521 // to issue a specific instruction for a full warp (32 threads).
523 // Assuming we have more than 1 warp in flight, a higher issue latency results
524 // in a lower result latency since the MP will have spent more time with other
526 // This also helps to determine the number of cycles between instructions in
// Returns the issue cost of instruction `i`, selected by its destination
// type: F32, then U32/S32, then F64 are examined in that order.
529 int TargetNVC0::getThroughput(const Instruction
*i
) const
531 // TODO: better values
532 if (i
->dType
== TYPE_F32
) {
559 if (i
->dType
== TYPE_U32
|| i
->dType
== TYPE_S32
) {
582 if (i
->dType
== TYPE_F64
) {
589 } // namespace nv50_ir