# panfrost/midgard: Add fround(_even), ftrunc, ffma
# [mesa.git] / src / gallium / drivers / panfrost / midgard / assemble.py
"""
Copyright (C) 2018 Alyssa Rosenzweig
Copyright (c) 2013 Connor Abbott (connor@abbott.cx)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
23
import pprint
import struct
import sys
27
# Parsed source program: one (instruction, arguments) tuple per input line
program = []

# Definitions from cwabbott's tools

# ALU mnemonic -> opcode, placed in the low bits of the ALU instruction word
t6xx_alu_ops = {
    "fadd": 0x10,
    "fmul": 0x14,
    "fmin": 0x28,
    "fmax": 0x2C,
    "fmov": 0x30,
    "ffloor": 0x36,
    "fceil": 0x37,
    "fdot3": 0x3C,
    "fdot3r": 0x3D,
    "fdot4": 0x3E,
    "freduce": 0x3F,
    "iadd": 0x40,
    "isub": 0x46,
    "imul": 0x58,
    "imov": 0x7B,
    "feq": 0x80,
    "fne": 0x81,
    "flt": 0x82,
    "fle": 0x83,
    "f2i": 0x99,
    "f2u8": 0x9C,
    "u2f": 0xBC,
    "ieq": 0xA0,
    "ine": 0xA1,
    "ilt": 0xA4,
    "ile": 0xA5,
    "iand": 0x70,
    "ior": 0x71,
    "inot": 0x72,
    "iandnot": 0x74,
    "ixor": 0x76,
    "ball": 0xA9,
    "bany": 0xB1,
    "i2f": 0xB8,
    "csel": 0xC5,
    "fatan_pt2": 0xE8,
    "frcp": 0xF0,
    "frsqrt": 0xF2,
    "fsqrt": 0xF3,
    "fexp2": 0xF4,
    "flog2": 0xF5,
    "fsin": 0xF6,
    "fcos": 0xF7,
    "fatan2_pt1": 0xF9,
}

# ALU unit -> bit set in the 32-bit bundle tag when that unit is used. The
# emitter also relies on these values to keep units in ascending order.
t6xx_alu_bits = {
    "vmul": 17,
    "sadd": 19,
    "vadd": 21,
    "smul": 23,
    "lut": 25,
    "br": 26,
    "branch": 27,
    "constants": 32,
}

# ALU unit -> size in bits of the instruction body emitted for that unit
t6xx_alu_size_bits = {
    "vmul": 48,
    "sadd": 32,
    "vadd": 48,
    "smul": 32,
    "lut": 48,
    "br": 16,
    "branch": 48,
}

# Output modifier name (third dot-separated field of a mnemonic) -> encoding
t6xx_outmod = {
    "none": 0,
    "pos": 1,
    "int": 2,
    "sat": 3,
}

# Register width mode name -> encoding
t6xx_reg_mode = {
    "quarter": 0,
    "half": 1,
    "full": 2,
    "double": 3,
}

# Destination override name -> encoding
t6xx_dest_override = {
    "lower": 0,
    "upper": 1,
    "none": 2,
}

# Load/store mnemonic -> opcode
t6xx_load_store_ops = {
    "ld_st_noop": 0x03,
    "ld_attr_16": 0x95,
    "ld_attr_32": 0x94,
    "ld_vary_16": 0x99,
    "ld_vary_32": 0x98,
    "ld_uniform_16": 0xAC,
    "ld_uniform_32": 0xB0,
    "st_vary_16": 0xD5,
    "st_vary_32": 0xD4,
    "ld_color_buffer_8": 0xBA,
}

# Bundle type -> 4-bit tag stored in the low nibble of each emitted bundle
t6xx_tag = {
    "texture": 0x3,
    "load_store": 0x5,
    "alu4": 0x8,
    "alu8": 0x9,
    "alu12": 0xA,
    "alu16": 0xB,
}
141
def is_tag_alu(tag):
    """Return True when `tag` lies within the alu4..alu16 range of t6xx_tag (inclusive)."""
    return t6xx_tag["alu4"] <= tag <= t6xx_tag["alu16"]
144
# Just an enum: the first element of every instruction_stream entry

ALU = 0
LDST = 1
TEXTURE = 2

# Constant types supported, mapping the constant prefix to the Python format
# string and the coercion function

constant_types = {
    "f": ("f", float),
    "h": ("e", float),
    "i": ("i", int),
    "s": ("h", int),
}

# Compact branch operation name -> encoding (low bits of the branch word)
compact_branch_op = {
    "jump": 1,
    "branch": 2,
    "discard": 4,
    "write": 7,
}

# Branch condition name -> encoding
branch_condition = {
    "false": 1,
    "true": 2,
    "always": 3,
}

# TODO: What else?

# Texture operation name -> opcode
texture_op = {
    "normal": 0x11,
    "texelfetch": 0x14,
}

# Texture format name -> encoding
texture_fmt = {
    "2d": 0x02,
    "3d": 0x03,
}
185
# Read the assembly source named by the first command-line argument. Each line
# becomes an (instruction, arguments) tuple appended to `program`.
with open(sys.argv[1], "r") as f:
    for ln in f:
        # The mnemonic is everything before the first space; the remainder
        # is a comma-separated argument list
        instruction, _, rest = ln.strip().partition(" ")

        arguments = [s.strip() for s in rest.split(",")]
        program.append((instruction, arguments))
195
# Component letter -> component index, shared by swizzle and mask decoding
swizzle_component = {
    "x": 0,
    "y": 1,
    "z": 2,
    "w": 3,
}
202
def decode_reg_name(reg_name):
    """Decode a bare register name (no swizzle, mask or modifiers).

    Accepted forms:
      * "rN"     -- full register N
      * "h?N"    -- half register; the first two characters are skipped and
                    N is split into a full register index and a half select
      * "N"      -- plain integer, the special case for load/store addresses

    Returns (ireg, half, upper): the full register index, whether a half
    register was named, and which half (N & 1 for half registers, otherwise
    False). Raises ValueError if the numeric part is not an integer.
    """
    if reg_name[0] == 'r':
        return (int(reg_name[1:]), False, False)

    if reg_name[0] == 'h':
        rreg = int(reg_name[2:])

        # Decode half-register into its full register's half
        return (rreg >> 1, True, rreg & 1)

    # Special case for load/store addresses
    return (int(reg_name), False, False)
222
def standard_swizzle_from_parts(swizzle_parts):
    """Pack a swizzle string into an integer, two bits per component.

    `swizzle_parts` is a register string already split on "."; element 1, if
    present, is the swizzle, otherwise "xyzw" is assumed. The i-th letter's
    component index lands at bit offset 2*i.
    """
    swizzle_s = swizzle_parts[1] if len(swizzle_parts) > 1 else "xyzw"

    swizzle = 0
    for (i, c) in enumerate(swizzle_s):
        swizzle |= swizzle_component[c] << (2 * i)

    return swizzle
231
def mask_from_parts(mask_parts, large_mask):
    """Build a write mask from a register string already split on ".".

    Element 1 of `mask_parts`, if present, lists the enabled components;
    otherwise "xyzw" is assumed. With `large_mask` every enabled component
    contributes two bits (0b11 at offset 2*index), otherwise one bit.

    Returns (mask, mask_s): the integer mask and the component string used.
    """
    mask_s = mask_parts[1] if len(mask_parts) > 1 else "xyzw"

    mask = 0
    for c in "xyzw":
        if c not in mask_s:
            continue

        if large_mask:
            mask += 3 << (2 * swizzle_component[c])
        else:
            mask += 1 << swizzle_component[c]

    return (mask, mask_s)
241
def decode_reg(reg):
    """Decode a source operand.

    Returns (immediate, value, swizzle, half, upper, modifiers):
      * "#<float>" is an inline constant: immediate is True, value is the
        16-bit pattern of the float packed as half precision, and every
        other field is 0.
      * Anything else is a register, optionally written as "-reg",
        "abs(reg)" or "-abs(reg)" and optionally followed by ".swizzle".
        modifiers has bit 0 set for abs and bit 1 set for negation.
    """
    if reg[0] == "#":
        # Not actually a register, instead an immediate float
        half_bits = struct.unpack("H", struct.pack("e", float(reg[1:])))[0]
        return (True, half_bits, 0, 0, 0, 0)

    # Function call syntax used in abs() modifier
    if reg[-1] == ')':
        reg = reg[:-1]

    swizzle_parts = reg.split(".")
    reg_name = swizzle_parts[0]

    modifiers = 0

    if reg_name[0] == '-':
        modifiers |= 2
        reg_name = reg_name[1:]

    if reg_name[0] == 'a':
        modifiers |= 1
        reg_name = reg_name[len("abs("):]

    (ireg, half, upper) = decode_reg_name(reg_name)
    swizzle = standard_swizzle_from_parts(swizzle_parts)

    return (False, ireg, swizzle, half, upper, modifiers)
268
def decode_masked_reg(reg, large_mask):
    """Decode a destination operand of the form "reg" or "reg.mask".

    `large_mask` is forwarded to mask_from_parts to pick the two-bit or
    one-bit per component mask layout.

    Returns (ireg, mask, component, half, upper), where component is the
    highest component index named in the mask (0 if none is named).
    """
    mask_parts = reg.split(".")

    (ireg, half, upper) = decode_reg_name(mask_parts[0])
    (mask, mask_s) = mask_from_parts(mask_parts, large_mask)

    enabled = [swizzle_component[c] for c in "xyzw" if c in mask_s]
    component = max([0] + enabled)

    return (ireg, mask, component, half, upper)
279
# TODO: Fill these in XXX

# Texture pipeline registers in r28-r29
TEXTURE_BASE = 28

def decode_texture_reg_number(reg):
    """Decode the register part of a texture operand, ignoring any ".suffix".

    Returns (full, select, upper):
      * "rN"  -> (True, N - TEXTURE_BASE, 0)
      * other -> the first two characters are skipped and the remaining
                 number N is read as a half register:
                 (False, (N >> 1) - TEXTURE_BASE, N & 1)
    """
    r = reg.split(".")[0]

    if r[0] == "r":
        return (True, int(r[1:]) - TEXTURE_BASE, 0)

    no = int(r[2:])
    return (False, (no >> 1) - TEXTURE_BASE, no & 1)
293
def decode_texture_reg(reg):
    """Decode a texture input operand of the form "reg.ab".

    Returns (full, select, upper, swizzleR, swizzleL), where swizzleL and
    swizzleR are the component indices of the first and second swizzle
    letters. Raises IndexError when the swizzle is missing or shorter than
    two letters.
    """
    (full, select, upper) = decode_texture_reg_number(reg)

    # Swizzle mandatory for texture registers, afaict
    swizzle = reg.split(".")[1]
    swizzleL = swizzle_component[swizzle[0]]
    swizzleR = swizzle_component[swizzle[1]]

    return (full, select, upper, swizzleR, swizzleL)
303
def decode_texture_out_reg(reg):
    """Decode a texture destination operand of the form "reg" or "reg.mask".

    Returns (full, select, upper, mask), with mask in the one-bit per
    component layout.
    """
    (full, select, upper) = decode_texture_reg_number(reg)
    (mask, _) = mask_from_parts(reg.split("."), False)

    return (full, select, upper, mask)
309
# Encoded instructions, in program order. Entries are one of:
#   (LDST, word)
#   (TEXTURE, word)
#   (ALU, unit, register_word, instruction_word)
#   (ALU, "constants", bytes)
instruction_stream = []

for p in program:
    ins = p[0]
    arguments = p[1]

    # Mnemonics are dot-separated: unit/family, operation, output modifier,
    # then optional extras. Missing fields decode as the empty string.
    family = ins_mod = ins.split(".")[0]
    ins_op = (ins + ".").split(".")[1]
    ins_outmod = (ins + "." + ".").split(".")[2]

    # An absent or unrecognised output modifier encodes as 0 ("none")
    out_mod = t6xx_outmod.get(ins_outmod, 0)

    if ins in t6xx_load_store_ops:
        op = t6xx_load_store_ops[ins]
        (reg, mask, _, _, _) = decode_masked_reg(arguments[0], False)
        (_, address, swizzle, _, _, _) = decode_reg(arguments[1])
        unknown = int(arguments[2], 16)

        b = (
            (op << 0)
            | (reg << 8)
            | (mask << 13)
            | (swizzle << 17)
            | (unknown << 25)
            | (address << 51)
        )
        instruction_stream += [(LDST, b)]
    elif ins_op in t6xx_alu_ops:
        op = t6xx_alu_ops[ins_op]

        (reg_out, mask, out_component, half0, upper0) = decode_masked_reg(arguments[0], True)
        (_, reg_in1, swizzle1, half1, upper1, mod1) = decode_reg(arguments[1])
        (immediate, reg_in2, swizzle2, half2, upper2, mod2) = decode_reg(arguments[2])

        if immediate:
            # The top bits of the inline constant live in the second source
            # slot of the register word; bit 15 flags the immediate
            register_word = (reg_in1 << 0) | ((reg_in2 >> 11) << 5) | (reg_out << 10) | (1 << 15)
        else:
            register_word = (reg_in1 << 0) | (reg_in2 << 5) | (reg_out << 10)

        if ins_mod in ["vadd", "vmul", "lut"]:
            io_mode = t6xx_reg_mode["half" if half0 else "full"]
            i1half = half1
            i2block = 0
            output_override = 2 # NORMAL, TODO

            if ins_outmod == "quarter":
                io_mode = t6xx_reg_mode["quarter"]

            if half0:
                # TODO: half actually
                repsel = 2 * upper1

                # Rare case... half destinations use the one-bit mask layout
                (_, wr_mask, _, _, _) = decode_masked_reg(arguments[0], False)
            else:
                repsel = upper1
                wr_mask = mask

            if immediate:
                # Inline constant: lower 11 bits

                i2block = ((reg_in2 & 0xFF) << 3) | ((reg_in2 >> 8) & 0x7)
            else:
                if half0:
                    # TODO: replicate input 2 if half
                    pass
                else:
                    # TODO: half selection
                    i2block = upper2 | (half2 << 2)

                i2block |= swizzle2 << 3

            # Extra modifier for some special cased stuff
            modifier_parts = ins.split(".")
            special = modifier_parts[3] if len(modifier_parts) > 3 else None

            if special == "low":
                output_override = 0 # low
            elif special == "fulllow":
                # TODO: Not really a special case, just a bug?
                io_mode = t6xx_reg_mode["full"]
                output_override = 0 # low
                wr_mask = 0xFF

            instruction_word = (
                (op << 0)
                | (io_mode << 8)
                | (mod1 << 10)
                | (repsel << 12)
                | (i1half << 14)
                | (swizzle1 << 15)
                | (mod2 << 23)
                | (i2block << 25)
                | (output_override << 36)
                | (out_mod << 38)
                | (wr_mask << 40)
            )
        elif ins_mod in ["sadd", "smul"]:
            # TODO: What are these?
            unknown2 = 0
            unknown3 = 0

            if half1:
                i1comp_block = swizzle1 | (upper1 << 2)
            else:
                i1comp_block = swizzle1 << 1

            if immediate:
                # Inline constant is splattered in a... bizarre way

                i2block = (
                    (((reg_in2 >> 9) & 3) << 0)
                    | (((reg_in2 >> 8) & 1) << 2)
                    | (((reg_in2 >> 5) & 7) << 3)
                    | (((reg_in2 >> 0) & 15) << 6)
                )
            else:
                # TODO: half register
                swizzle2 = (swizzle2 << 1) & 0x1F
                i2block = (mod2 << 0) | ((not half2) << 2) | (swizzle2 << 3) | (unknown2 << 5)

            # TODO: half register destinations
            outcomp_block = out_component << 1

            instruction_word = (
                (op << 0)
                | (mod1 << 8)
                | ((not half1) << 10)
                | (i1comp_block << 11)
                | (i2block << 14)
                | (unknown3 << 25)
                | (out_mod << 26)
                | ((not half0) << 28)
                | (outcomp_block << 29)
            )
        else:
            instruction_word = op

        instruction_stream += [(ALU, ins_mod, register_word, instruction_word)]
    elif family == "texture":
        # Texture ops use long series of modifiers to describe their needed
        # capabilities, separated by dots. Decode them here
        parts = ins.split(".")

        # First few modifiers are fixed, like an instruction name
        tex_op = parts[1]
        tex_fmt = parts[2]

        # The remaining are variable, but strictly ordered
        parts = parts[3:]

        op = texture_op[tex_op]

        # Some bits are defined directly in the modifier list
        shadow = "shadow" in parts
        cont = "cont" in parts
        last = "last" in parts
        has_filter = "raw" not in parts

        # The remaining need order preserved since they have their own arguments
        argument_parts = [part for part in parts if part not in ["shadow", "cont", "last", "raw"]]

        bias_lod = 0

        # Modifiers with arguments consume operands from the fifth onwards
        for argument, part in zip(argument_parts, arguments[4:]):
            if argument == "bias":
                bias_lod = int(float(part) * 256)
            else:
                print("Unknown argument: " + str(argument))

        fmt = texture_fmt[tex_fmt]
        has_offset = 0

        magic1 = 1 # IDEK
        magic2 = 2 # Where did this even come from?!

        texture_handle = int(arguments[1][len("texture"):])

        sampler_parts = arguments[2].split(".")
        sampler_handle = int(sampler_parts[0][len("sampler"):])
        swizzle0 = standard_swizzle_from_parts(sampler_parts)

        (full0, select0, upper0, mask0) = decode_texture_out_reg(arguments[0])
        (full1, select1, upper1, swizzleR1, swizzleL1) = decode_texture_reg(arguments[3])

        tex = (
            (op << 0)
            | (shadow << 6)
            | (cont << 8)
            | (last << 9)
            | (fmt << 10)
            | (has_offset << 15)
            | (has_filter << 16)
            | (select1 << 17)
            | (upper1 << 18)
            | (swizzleL1 << 19)
            | (swizzleR1 << 21)
            | (0 << 23)
            | (magic2 << 25)
            | (full0 << 29)
            | (magic1 << 30)
            | (select0 << 32)
            | (upper0 << 33)
            | (mask0 << 34)
            | (swizzle0 << 40)
            | (bias_lod << 72)
            | (texture_handle << 88)
            | (sampler_handle << 104)
        )

        instruction_stream += [(TEXTURE, tex)]
    elif family == "br":
        cond = ins.split(".")[2]
        condition = branch_condition[cond]
        bop = compact_branch_op[ins_op]

        # The operand is written "offset->dest_tag"
        offset = int(arguments[0].split("->")[0])

        # 2's complement and chill
        if offset < 0:
            offset = (1 << 7) - abs(offset)

        # Find where we're going
        dest_tag = int(arguments[0].split("->")[1])

        br = (bop << 0) | (dest_tag << 3) | (offset << 7) | (condition << 14)

        # TODO: Unconditional branch encoding

        instruction_stream += [(ALU, "br", None, br)]
    elif ins[1:] == "constants":
        if ins[0] not in constant_types:
            # Report the offending prefix and stop assembling
            print("Unknown constant type " + str(ins[0]))
            break

        (fmt, cast) = constant_types[ins[0]]

        consts = bytearray()
        for value in arguments:
            consts += struct.pack(fmt, cast(value))

        # consts must be exactly 4 quadwords, so pad with zeroes if necessary
        consts += bytes(4*4 - len(consts))

        instruction_stream += [(ALU, "constants", consts)]
522
# Emit from instruction stream: group encoded instructions into bundles, each
# stored as a bytearray in `instructions`
instructions = []
index = 0
while index < len(instruction_stream):
    output_stream = bytearray()
    ins = instruction_stream[index]
    tag = ins[0]

    can_prefetch = index + 1 < len(instruction_stream)
    succeeding = None

    if tag == LDST:
        succeeding = instruction_stream[index + 1] if can_prefetch else None

        if succeeding and succeeding[0] == LDST:
            # Two consecutive load/stores share one bundle
            parta = ins[1]
            partb = succeeding[1]
            index += 1
        else:
            # A lone load/store goes in the second slot behind a no-op
            parta = t6xx_load_store_ops["ld_st_noop"]
            partb = ins[1]

        tag8 = t6xx_tag["load_store"]

        ins = (partb << 68) | (parta << 8) | tag8
        output_stream += ins.to_bytes(16, "little")
    elif tag == TEXTURE:
        tag8 = t6xx_tag["texture"]
        ins = (ins[1] << 8) | tag8

        output_stream += ins.to_bytes(16, "little")
    elif tag == ALU:
        # TODO: Combining ALU ops

        emit_size = 4 # 32-bit tag always emitted

        tag = 0
        register_words = bytearray()
        body_words = bytearray()
        constant_words = None

        last_alu_bit = 0

        # Iterate through while there are ALU tags in strictly ascending order
        while (index < len(instruction_stream)
               and instruction_stream[index][0] == ALU
               and t6xx_alu_bits[instruction_stream[index][1]] > last_alu_bit):
            ins = instruction_stream[index]
            unit = ins[1]

            bit = t6xx_alu_bits[unit]
            last_alu_bit = bit

            if unit == "constants":
                constant_words = ins[2]
            else:
                # Flag for the used part of the GPU
                tag |= 1 << bit

                # 16-bit register word, if present
                if ins[2] is not None:
                    register_words += ins[2].to_bytes(2, "little")
                    emit_size += 2

                size = t6xx_alu_size_bits[unit] // 8
                body_words += ins[3].to_bytes(size, "little")
                emit_size += size

            index += 1

        index -= 1 # fix off by one, from later loop increment

        # Pad to nearest multiple of 4 words
        padding = (16 - (emit_size & 15)) if (emit_size & 15) else 0
        emit_size += padding

        # emit_size includes constants
        if constant_words:
            emit_size += len(constant_words)

        # Calculate tag given size
        words = emit_size >> 2
        tag |= t6xx_tag["alu" + str(words)]

        # Actually emit, now that we can
        output_stream += tag.to_bytes(4, "little")
        output_stream += register_words
        output_stream += body_words
        output_stream += bytes(padding)

        if constant_words:
            output_stream += constant_words

    instructions += [output_stream]
    index += 1
616
# Assembly over; just emit tags at this point
binary = bytearray()

for (idx, ins) in enumerate(instructions):
    # Instruction prefetch: the high nibble of a bundle's first byte holds
    # the tag of the bundle that follows it
    if idx + 1 < len(instructions):
        tag = instructions[idx + 1][0] & 0xF

        # Check for ALU special case: an ALU bundle in final position is
        # announced with tag 1 instead of its own tag

        if is_tag_alu(tag) and idx + 2 == len(instructions):
            tag = 1
    else:
        # Instruction stream over

        tag = 1

    ins[0] |= tag << 4

    binary += ins

pprint.pprint(program)

# The assembled binary goes to the file named by the second argument
with open(sys.argv[2], "wb") as f:
    f.write(binary)