1 # SPDX-License-Identifier: LGPL-3-or-later
2 # Copyright 2022 Jacob Lifshay
5 from openpower
.decoder
.selectable_int
import SelectableInt
6 from openpower
.simulator
.program
import Program
7 from openpower
.test
.common
import TestAccumulatorBase
8 from openpower
.test
.state
import ExpectedState
9 from openpower
.sv
.trans
.svp64
import SVP64Asm
# Address reserved for the UTF-8 validation input data.
# NOTE(review): the test case appears to hard-code the same value (0x10000)
# for the input pointer in r3 -- confirm whether it should use this constant.
SVP64_UTF8_VALIDATION_DATA_ADDR = 0x1_0000
15 class UTF8FirstTwoBytesError(enum
.IntFlag
):
16 """ Error conditions that are detectable from just the first two bytes in
21 """ ascii byte followed by a continuation byte """
24 """ leading byte followed by something other than a continuation byte """
27 """ value is `< 0x80` but is encoded using 2 bytes """
30 """ value is a surrogate (`0xD800 <= value <= 0xDFFF`) """
33 """ value is `< 0x800` but is encoded using 3 bytes """
35 Overlong4OrTooLarge
= 1 << 5
37 * `< 0x10000` but is encoded using 4 bytes
38 * or the value is `>= 0x140000` with the first continuation byte
41 The rest of the cases where the value is `> 0x10FFFF` are covered by
46 """ value is `> 0x10FFFF` with the first continuation byte being `>= 0x90`
48 The rest of the cases where the value is `> 0x10FFFF` are covered by
49 `Overlong4OrTooLarge`.
52 TwoContinuations
= 1 << 7
53 """ not actually an error -- two continuations in a row """
55 AllActualErrors
= (TooLong | TooShort | Overlong2 | Surrogate |
56 Overlong3 | Overlong4OrTooLarge | TooLarge
)
# Look-up tables for detecting errors from just the first two bytes. The
# final error flags are produced by looking up three nibbles (both nibbles of
# the first byte and the high nibble of the second byte) in their respective
# tables and bitwise ANDing the three results together.
# To work out what belongs in a table entry, OR together every error case
# that could still occur for byte pairs matching that entry's comment.
# Short aliases for the UTF8FirstTwoBytesError flags, so that the look-up
# table entries stay compact enough to read at a glance.
_TLN = UTF8FirstTwoBytesError.TooLong  # ascii byte, then a continuation
_TS = UTF8FirstTwoBytesError.TooShort  # leading byte, then a non-continuation
_O2 = UTF8FirstTwoBytesError.Overlong2  # overlong 2-byte encoding
_SG = UTF8FirstTwoBytesError.Surrogate  # encoded surrogate
_O3 = UTF8FirstTwoBytesError.Overlong3  # overlong 3-byte encoding
_O4TL = UTF8FirstTwoBytesError.Overlong4OrTooLarge  # overlong 4-byte/too large
_TLG = UTF8FirstTwoBytesError.TooLarge  # value is > 0x10FFFF
_2C = UTF8FirstTwoBytesError.TwoContinuations  # not an error by itself
# Base address the assembly loads into r6 for byte look-ups in this table.
FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF00
# Flags that are still possible when only the high nibble of the first byte
# is known; entry N covers the byte pairs `0xN? 0x??`.
FIRST_BYTE_HIGH_NIBBLE_LUT = (
    [_TLN] * 8  # first 2 bytes are 0x0? 0x?? through 0x7? 0x??
    + [_2C] * 4  # first 2 bytes are 0x8? 0x?? through 0xB? 0x??
    + [
        _TS | _O2,  # first 2 bytes are 0xC? 0x??
        _TS,  # first 2 bytes are 0xD? 0x??
        _TS | _SG | _O3,  # first 2 bytes are 0xE? 0x??
        _TS | _O4TL | _TLG,  # first 2 bytes are 0xF? 0x??
    ]
)
# Base address the assembly loads into r7 for this table.
FIRST_BYTE_LOW_NIBBLE_LUT_ADDR = 0xFF10
# Flags that are still possible when only the low nibble of the first byte
# is known; entry N covers the byte pairs `0x?N 0x??`.
FIRST_BYTE_LOW_NIBBLE_LUT = (
    [
        _TLN | _TS | _O2 | _O3 | _O4TL | _2C,  # first 2 bytes are 0x?0 0x??
        _TLN | _TS | _O2 | _2C,  # first 2 bytes are 0x?1 0x??
    ]
    + [_TLN | _TS | _2C] * 2  # first 2 bytes are 0x?2 0x?? or 0x?3 0x??
    + [_TLN | _TS | _TLG | _2C]  # first 2 bytes are 0x?4 0x??
    # first 2 bytes are 0x?5 0x?? through 0x?C 0x??
    + [_TLN | _TS | _O4TL | _TLG | _2C] * 8
    + [_TLN | _TS | _SG | _O4TL | _TLG | _2C]  # first 2 bytes are 0x?D 0x??
    # first 2 bytes are 0x?E 0x?? or 0x?F 0x??
    + [_TLN | _TS | _O4TL | _TLG | _2C] * 2
)
# Base address the assembly loads into r8 for this table.
SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF20
# Flags that are still possible when only the high nibble of the second byte
# is known; entry N covers the byte pairs `0x?? 0xN?`.
SECOND_BYTE_HIGH_NIBBLE_LUT = (
    [_TS] * 8  # first 2 bytes are 0x?? 0x0? through 0x?? 0x7?
    + [
        _TLN | _O2 | _O3 | _O4TL | _2C,  # first 2 bytes are 0x?? 0x8?
        _TLN | _O2 | _O3 | _TLG | _2C,  # first 2 bytes are 0x?? 0x9?
    ]
    + [_TLN | _O2 | _SG | _TLG | _2C] * 2  # 0x?? 0xA? or 0x?? 0xB?
    + [_TS] * 4  # first 2 bytes are 0x?? 0xC? through 0x?? 0xF?
)
133 def svp64_utf8_validation_asm():
134 # raise NotImplementedError("not finished")
136 # input addr in r3, input length in r4
137 # prev bytes in r45-r47 -- u64x3
138 # cur bytes in r48-r63 -- u64x16
139 # nibbles to look up in r80-r95 -- u64x16
140 # error flags in r64-r79 -- u64x16
141 "setvl 0, 0, 3, 0, 1, 1", # set VL to 3
142 "sv.addi *45, 0, 0", # clear prev bytes
143 f
"lis 6, {FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR >> 16}",
144 f
"ori 6, 6, {FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR & 0xFFFF}",
145 f
"lis 7, {FIRST_BYTE_LOW_NIBBLE_LUT_ADDR >> 16}",
146 f
"ori 7, 7, {FIRST_BYTE_LOW_NIBBLE_LUT_ADDR & 0xFFFF}",
147 f
"lis 8, {SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR >> 16}",
148 f
"ori 8, 8, {SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR & 0xFFFF}",
150 "setvl 0, 0, 3, 0, 1, 1", # set VL to 3
151 "sv.ori *45, *61, 0", # copy prev bytes from end of cur bytes
153 # clear cur bytes, so bytes beyond end end up being zeros
154 "setvl 0, 0, 16, 0, 1, 1", # set VL to 16
155 "sv.addi *48, 0, 0", # clear cur bytes
156 "setvl. 5, 4, 16, 0, 1, 1", # set VL to min(16, r4)
157 "beq final_check", # if no bytes left to load, run final check
158 "sv.lbz/els *48, 0(3)", # load bytes
159 "setvl 0, 0, 16, 0, 1, 1", # set VL to 16
160 # now we can operate on 16 byte chunks, branch to `fail` if they don't
163 # get high nibbles of input shifted by 1 byte
164 f
"sv.rldicl *80, *47, {64 - 4}, 4", # srdi *80, *47, 4
165 # look-up nibbles in table, writing to error flags
166 "sv.lbzx *64, 6, *80",
168 # get low nibbles of input shifted by 1 byte
169 f
"sv.andi. *80, *47, {0xF}", # there is no andi without Rc
170 # look-up nibbles in table
171 "sv.lbzx *80, 6, *80",
172 # bitwise and into error flags
173 "sv.and *64, *64, *80",
175 # get high nibbles of input
176 "sv.srdi *80, *48, 4",
177 # look-up nibbles in table
178 "sv.lbzx *80, 6, *80",
179 # bitwise and into error flags
180 "sv.and *64, *64, *80",
182 # or-reduce error flags into r96
184 "sv.or *81, *80, *81",
185 # check for any actual error flags set
186 f
"sv.andi. 96, 96, {UTF8FirstTwoBytesError.AllActualErrors}",
189 # check for the correct number of continuation bytes for 3/4-byte cases
190 # set bit 0x80 (TwoContinuations) if input is >= 0xE0
191 f
"sv.subi/satu *80, *46, {0xE0 - 0x80}",
192 # xor into error flags
193 "sv.xor *64, *64, *80",
194 # set bit 0x80 (TwoContinuations) if input is >= 0xF0
195 f
"sv.subi/satu *80, *45, {0xF0 - 0x80}",
196 # xor into error flags
197 "sv.xor *80, *64, *80",
198 # now bit 0x80 is set in r80-95 if there's an error
200 "sv.or *81, *80, *81",
201 # adjust count/pointer
202 "add 3, 3, 5", # increment pointer
203 "sub 4, 4, 5", # decrement count
204 f
"sv.andi. 96, 96, {0x80}", # check if any errors
205 "beq loop", # if no errors loop, else fail
210 # check if prev input is incomplete
211 # check if byte 3 bytes from end needed 4 bytes
212 f
"sv.cmpli 0, 1, 45, {0xF0}",
214 # check if byte 2 bytes from end needed 3 bytes
215 f
"sv.cmpli 0, 1, 46, {0xE0}",
217 # check if byte 1 bytes from end needed 2 bytes
218 f
"sv.cmpli 0, 1, 47, {0xC0}",
225 class SVP64UTF8ValidationTestCase(TestAccumulatorBase
):
226 def run_case(self
, data
):
227 # type: (bytes) -> None
231 except UnicodeDecodeError:
233 isa
= SVP64Asm(svp64_utf8_validation_asm())
235 initial_regs
= [0x15cee3293aa9bfbe] * 128 # fill with junk
236 initial_regs
[3] = 0x10000 # pointer to bytes to check
237 initial_regs
[4] = len(data
) # length of bytes to check
240 for i
, v
in enumerate(data
):
241 initial_mem
[i
+ initial_regs
[3]] = v
, 1
242 stop_at_pc
= 0x10000000
243 initial_sprs
= {8: SelectableInt(stop_at_pc
, 64)}
244 e
= ExpectedState(pc
=stop_at_pc
)
245 e
.intregs
[3] = expected
246 self
.add_case(Program(lst
, 0), initial_regs
, initial_mem
=initial_mem
,
247 initial_sprs
=initial_sprs
, stop_at_pc
=stop_at_pc
,
def run_cases(self, data):
    # type: (bytes | str) -> None
    """Run `run_case` on many sub-slices of `data` padded with spaces.

    `data` may be `bytes` or `str`; a `str` is encoded as UTF-8 first.
    The input is surrounded by 8 spaces on each side, then `run_case` is
    called for every slice `padded[start:stop]` with
    `0 <= start <= stop < len(padded)`, so the interesting bytes are tested
    at many offsets and truncated at every possible point.
    """
    if isinstance(data, str):
        # "surrogatepass" is required so that lone surrogates (U+D800 to
        # U+DFFF, used by the surrogate test cases) are encoded to their
        # ill-formed 3-byte form instead of raising UnicodeEncodeError.
        data = data.encode("utf-8", errors="surrogatepass")
    data = b' ' * 8 + data + b' ' * 8
    # NOTE(review): the original computed `part` from `data` and `i` on a
    # line that is not visible here; `part = data[i:]` is assumed -- confirm.
    for start in range(len(data)):
        for stop in range(start, len(data)):
            self.run_case(data[start:stop])
260 def case_empty(self
):
264 self
.run_cases("\u0000") # min 1-byte
270 self
.run_cases("\u007F") # max 1-byte
def case_c0_80(self):
    """Cover 0xC0 0x80, the minimum 2-byte overlong encoding."""
    overlong = b"\xC0\x80"
    self.run_cases(overlong)
def case_c1_bf(self):
    """Cover 0xC1 0xBF, the maximum 2-byte overlong encoding."""
    overlong = b"\xC1\xBF"
    self.run_cases(overlong)
def case_u0080(self):
    """Cover U+0080, the minimum value that needs a 2-byte encoding."""
    text = "\u0080"
    self.run_cases(text)
def case_u07ff(self):
    """Cover U+07FF, the maximum value of a 2-byte encoding."""
    text = "\u07FF"
    self.run_cases(text)
def case_e0_80_80(self):
    """Cover 0xE0 0x80 0x80, the minimum 3-byte overlong encoding."""
    overlong = b"\xE0\x80\x80"
    self.run_cases(overlong)
def case_e0_9f_bf(self):
    """Cover 0xE0 0x9F 0xBF, the maximum 3-byte overlong encoding."""
    overlong = b"\xE0\x9F\xBF"
    self.run_cases(overlong)
def case_u0800(self):
    """Cover U+0800, the minimum value that needs a 3-byte encoding."""
    text = "\u0800"
    self.run_cases(text)
def case_u0fff(self):
    """Cover U+0FFF, the last value whose encoding starts with 0xE0."""
    text = "\u0FFF"
    self.run_cases(text)
def case_u1000(self):
    """Cover U+1000, the first value whose encoding starts with 0xE1."""
    text = "\u1000"
    self.run_cases(text)
def case_ucfff(self):
    """Cover U+CFFF, the last value whose encoding starts with 0xEC."""
    text = "\uCFFF"
    self.run_cases(text)
def case_ud000(self):
    """Cover U+D000, the first value whose encoding starts with 0xED."""
    text = "\uD000"
    self.run_cases(text)
def case_ud7ff(self):
    """Cover U+D7FF, the last value before the surrogate range."""
    text = "\uD7FF"
    self.run_cases(text)
def case_ud800(self):
    """Cover the surrogate U+D800 encoded as 0xED 0xA0 0x80."""
    # A lone surrogate can't be encoded by a plain `.encode("utf-8")` (it
    # raises UnicodeEncodeError), so encode it here with "surrogatepass"
    # and hand the resulting ill-formed bytes to run_cases.
    encoded = "\uD800".encode("utf-8", errors="surrogatepass")
    self.run_cases(encoded)  # surrogate
def case_udbff(self):
    """Cover the surrogate U+DBFF encoded as 0xED 0xAF 0xBF."""
    # A lone surrogate can't be encoded by a plain `.encode("utf-8")` (it
    # raises UnicodeEncodeError), so encode it here with "surrogatepass"
    # and hand the resulting ill-formed bytes to run_cases.
    encoded = "\uDBFF".encode("utf-8", errors="surrogatepass")
    self.run_cases(encoded)  # surrogate
def case_udc00(self):
    """Cover the surrogate U+DC00 encoded as 0xED 0xB0 0x80."""
    # A lone surrogate can't be encoded by a plain `.encode("utf-8")` (it
    # raises UnicodeEncodeError), so encode it here with "surrogatepass"
    # and hand the resulting ill-formed bytes to run_cases.
    encoded = "\uDC00".encode("utf-8", errors="surrogatepass")
    self.run_cases(encoded)  # surrogate
def case_udfff(self):
    """Cover the surrogate U+DFFF encoded as 0xED 0xBF 0xBF."""
    # A lone surrogate can't be encoded by a plain `.encode("utf-8")` (it
    # raises UnicodeEncodeError), so encode it here with "surrogatepass"
    # and hand the resulting ill-formed bytes to run_cases.
    encoded = "\uDFFF".encode("utf-8", errors="surrogatepass")
    self.run_cases(encoded)  # surrogate
def case_ue000(self):
    """Cover U+E000, the first value after the surrogate range."""
    text = "\uE000"
    self.run_cases(text)
def case_uffff(self):
    """Cover U+FFFF, the maximum value of a 3-byte encoding."""
    text = "\uFFFF"
    self.run_cases(text)
def case_f0_80_80_80(self):
    """Cover 0xF0 0x80 0x80 0x80, the minimum 4-byte overlong encoding."""
    overlong = b"\xF0\x80\x80\x80"
    self.run_cases(overlong)
def case_f0_bf_bf_bf(self):
    """Cover 0xF0 0x8F 0xBF 0xBF, the maximum 4-byte overlong encoding.

    NOTE(review): the method name says `f0_bf_bf_bf` but the bytes tested
    are F0 8F BF BF; the bytes match the "max overlong" intent, so the name
    looks like the inaccurate part -- confirm before renaming.
    """
    overlong = b"\xF0\x8F\xBF\xBF"
    self.run_cases(overlong)
def case_u00010000(self):
    """Cover U+10000, the minimum value that needs a 4-byte encoding."""
    text = "\U00010000"
    self.run_cases(text)
def case_u0003ffff(self):
    """Cover U+3FFFF, the last value whose encoding starts with 0xF0."""
    text = "\U0003FFFF"
    self.run_cases(text)
def case_u00040000(self):
    """Cover U+40000, the first value whose encoding starts with 0xF1."""
    text = "\U00040000"
    self.run_cases(text)
def case_u000fffff(self):
    """Cover U+FFFFF, the last value whose encoding starts with 0xF3."""
    text = "\U000FFFFF"
    self.run_cases(text)
def case_u00100000(self):
    """Cover U+100000, the first value whose encoding starts with 0xF4."""
    text = "\U00100000"
    self.run_cases(text)
def case_u0010ffff(self):
    """Cover U+10FFFF, the maximum value of a 4-byte encoding."""
    text = "\U0010FFFF"
    self.run_cases(text)
def case_f4_90_80_80(self):
    """Cover 0xF4 0x90 0x80 0x80, the first too-big encoding."""
    too_big = b"\xF4\x90\x80\x80"
    self.run_cases(too_big)
def case_f7_bf_bf_bf(self):
    """Cover 0xF7 0xBF 0xBF 0xBF, the maximum too-big 4-byte encoding."""
    too_big = b"\xF7\xBF\xBF\xBF"
    self.run_cases(too_big)
def case_f8_x4_80(self):
    """Cover 0xF8 plus four 0x80 bytes: min too-big 5-byte encoding."""
    lead = b"\xF8"
    continuations = b"\x80" * 4
    self.run_cases(lead + continuations)
def case_fb_x4_bf(self):
    """Cover 0xFB plus four 0xBF bytes: max too-big 5-byte encoding."""
    lead = b"\xFB"
    continuations = b"\xBF" * 4
    self.run_cases(lead + continuations)
def case_fc_x5_80(self):
    """Cover 0xFC plus five 0x80 bytes: min too-big 6-byte encoding."""
    lead = b"\xFC"
    continuations = b"\x80" * 5
    self.run_cases(lead + continuations)
def case_fd_x5_bf(self):
    """Cover 0xFD plus five 0xBF bytes: max too-big 6-byte encoding."""
    lead = b"\xFD"
    continuations = b"\xBF" * 5
    self.run_cases(lead + continuations)
def case_fe_x6_80(self):
    """Cover 0xFE plus six 0x80 bytes: min too-big 7-byte encoding."""
    lead = b"\xFE"
    continuations = b"\x80" * 6
    self.run_cases(lead + continuations)
def case_fe_x6_bf(self):
    """Cover 0xFE plus six 0xBF bytes: max too-big 7-byte encoding."""
    lead = b"\xFE"
    continuations = b"\xBF" * 6
    self.run_cases(lead + continuations)
def case_ff_x7_80(self):
    """Cover 0xFF plus seven 0x80 bytes: min too-big 8-byte encoding."""
    lead = b"\xFF"
    continuations = b"\x80" * 7
    self.run_cases(lead + continuations)
def case_ff_x7_bf(self):
    """Cover 0xFF plus seven 0xBF bytes: max too-big 8-byte encoding."""
    lead = b"\xFF"
    continuations = b"\xBF" * 7
    self.run_cases(lead + continuations)