ec82977d8d25352a969c1e3f5179bd5f102f45bf
[gcc.git] / gcc / config / i386 / ppro.md
1 ;; Scheduling for the Intel P6 family of processors
2 ;; Copyright (C) 2004 Free Software Foundation, Inc.
3 ;;
4 ;; This file is part of GCC.
5 ;;
6 ;; GCC is free software; you can redistribute it and/or modify
7 ;; it under the terms of the GNU General Public License as published by
8 ;; the Free Software Foundation; either version 2, or (at your option)
9 ;; any later version.
10 ;;
11 ;; GCC is distributed in the hope that it will be useful,
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;; GNU General Public License for more details.
15 ;;
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING. If not, write to
18 ;; the Free Software Foundation, 59 Temple Place - Suite 330,
19 ;; Boston, MA 02111-1307, USA.
20
21 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
22 ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
23 ;; based on information that can be found in the following three documents:
24 ;;
25 ;; "P6 Family of Processors Hardware Developer's Manual",
26 ;; Intel, September 1999.
27 ;;
28 ;; "Intel Architecture Optimization Manual",
29 ;; Intel, 1999 (Order Number: 245127-001).
30 ;;
31 ;; "How to optimize for the Pentium family of microprocessors",
32 ;; by Agner Fog, PhD.
33 ;;
34 ;; The P6 pipeline has three major components:
35 ;; 1) the FETCH/DECODE unit, an in-order issue front-end
36 ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
37 ;; 3) the RETIRE unit, an in-order retirement unit
38 ;;
39 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
40 ;; retirement unit are naturally in-order.
41 ;;
42 ;;                    BUS INTERFACE UNIT
43 ;;                     /               \
44 ;;               L1 ICACHE           L1 DCACHE
45 ;;            /      |       \           |      \
46 ;;       DECODER0 DECODER1 DECODER2  DISP/EXEC  RETIRE
47 ;;            \      |       /           |         |
48 ;;            INSTRUCTION POOL __________|_______/
49 ;;         (inc. reorder buffer)
50 ;;
51 ;; Since the P6 CPUs execute instructions out-of-order, the most important
52 ;; consideration in performance tuning is making sure enough micro-ops are
53 ;; ready for execution in the out-of-order core, while not stalling the
54 ;; decoder.
55 ;;
56 ;; TODO:
57 ;; - Find a less crude way to model complex instructions, in
58 ;; particular how many cycles they take to be decoded.
59 ;; - Include decoder latencies in the total reservation latencies.
60 ;; This isn't necessary right now because we assume for every
61 ;; instruction that it never blocks a decoder.
62 ;; - Figure out where the p0 and p1 reservations come from. These
63 ;; appear not to be in the manual (e.g. why is cld "(p0+p1)*2"
64 ;; better than "(p0|p1)*4" ???)
65 ;; - Lots more because I'm sure this is still far from optimal :-)
66
67 ;; The ppro_idiv and ppro_fdiv automata are used to model issue
68 ;; latencies of idiv and fdiv type insns.
69 (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")  ;; six automata; each cpu unit declared below is assigned to one of them
70
71 ;; Simple instructions of the register-register form have only one uop.
72 ;; Load instructions are also only one uop. Store instructions decode to
73 ;; two uops, and simple read-modify instructions also take two uops.
74 ;; Simple instructions of the register-memory form have two to three uops.
75 ;; Simple read-modify-write instructions have four uops. The rules for
76 ;; the decoder are simple:
77 ;; - an instruction with 1 uop can be decoded by any of the three
78 ;; decoders in one cycle.
79 ;; - an instruction with 2 to 4 uops can be decoded only by decoder 0
80 ;; but still in only one cycle.
81 ;; - a complex (microcode) instruction can also only be decoded by
82 ;; decoder 0, and this takes an unspecified number of cycles.
83 ;;
84 ;; The goal is to schedule such that we have a few-one-one uops sequence
85 ;; in each cycle, to decode as many instructions per cycle as possible.
86 (define_cpu_unit "decoder0" "ppro_decoder")  ;; the only decoder that accepts multi-uop and microcoded insns
87 (define_cpu_unit "decoder1" "ppro_decoder")  ;; single-uop insns only
88 (define_cpu_unit "decoder2" "ppro_decoder")  ;; single-uop insns only
89
90 ;; We first wish to find an instruction for decoder0, so exclude
91 ;; decoder1 and decoder2 from being reserved until decoder 0 is
92 ;; reserved.
93 (presence_set "decoder1" "decoder0")  ;; decoder1 may be reserved only when decoder0 is reserved
94 (presence_set "decoder2" "decoder0")  ;; same constraint for decoder2
95
96 ;; Most instructions can be decoded on any of the three decoders.
97 (define_reservation "decodern" "(decoder0|decoder1|decoder2)")  ;; shorthand: any one of the three decoders
98
99 ;; The out-of-order core has five pipelines. During each cycle, the core
100 ;; may dispatch zero or one uop on the port of any of the five pipelines
101 ;; so the maximum number of dispatched uops per cycle is 5. In practice,
102 ;; 3 uops per cycle is more realistic.
103 ;;
104 ;; Two of the five pipelines contain several execution units:
105 ;;
106 ;;    Port 0     Port 1     Port 2     Port 3     Port 4
107 ;;     ALU        ALU        LOAD       SAC        SDA
108 ;;     FPU        JUE
109 ;;     AGU        MMX
110 ;;     MMX        P3FPU
111 ;;     P3FPU
112 ;;
113 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
114 ;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
115 ;;
116 (define_cpu_unit "p0,p1" "ppro_core")  ;; dispatch ports 0 and 1 (ALU, FPU, MMX, jump units)
117 (define_cpu_unit "p2" "ppro_load")  ;; dispatch port 2: the load unit
118 (define_cpu_unit "p3,p4" "ppro_store")  ;; dispatch ports 3 and 4: store address and store data
119 (define_cpu_unit "idiv" "ppro_idiv")  ;; used to model the issue latency of integer division
120 (define_cpu_unit "fdiv" "ppro_fdiv")  ;; used to model the issue latency of floating point division
121
122 ;; Only the irregular instructions have to be modeled here. A load
123 ;; increases the latency by 2 or 3, or by nothing if the manual gives
124 ;; a latency already. Store latencies are not accounted for.
125 ;;
126 ;; The simple instructions follow a very regular pattern of 1 uop per
127 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
128 ;; on port 4 and port 3. These instructions are modelled at the bottom
129 ;; of this file.
130 ;;
131 ;; For microcoded instructions we don't know how many uops are produced.
132 ;; These instructions are the "complex" ones in the Intel manuals. All
133 ;; we _do_ know is that they typically produce four or more uops, so
134 ;; they can only be decoded on decoder0. Modelling their latencies
135 ;; doesn't make sense because we don't know how these instructions are
136 ;; executed in the core. So we just model that they can only be decoded
137 ;; on decoder 0, and say that it takes a little while before the result
138 ;; is available.
139 (define_insn_reservation "ppro_complex_insn" 6  ;; latency 6 stands in for "a little while"
140 (and (eq_attr "cpu" "pentiumpro")
141 (eq_attr "type" "other,multi,call,callv,str"))
142 "decoder0")  ;; decoder0 only; no execution port is reserved
143
144 ;; imov with memory operands does not use the integer units.
145 (define_insn_reservation "ppro_imov" 1  ;; integer move without a memory operand, latency 1
146 (and (eq_attr "cpu" "pentiumpro")
147 (and (eq_attr "memory" "none")
148 (eq_attr "type" "imov")))
149 "decodern,(p0|p1)")  ;; any decoder, then one cycle on port 0 or port 1
150
151 (define_insn_reservation "ppro_imov_load" 4  ;; integer move from memory, latency 4
152 (and (eq_attr "cpu" "pentiumpro")
153 (and (eq_attr "memory" "load")
154 (eq_attr "type" "imov")))
155 "decodern,p2")  ;; any decoder, then the load port; no integer unit is used
156
157 (define_insn_reservation "ppro_imov_store" 1  ;; integer move to memory, latency 1
158 (and (eq_attr "cpu" "pentiumpro")
159 (and (eq_attr "memory" "store")
160 (eq_attr "type" "imov")))
161 "decoder0,p4+p3")  ;; decoder0 (a store is two uops), then both store ports in the same cycle
162
163 ;; imovx always decodes to one uop, and also doesn't use the integer
164 ;; units if it has memory operands.
165 (define_insn_reservation "ppro_imovx" 1  ;; extending move without a memory operand, latency 1
166 (and (eq_attr "cpu" "pentiumpro")
167 (and (eq_attr "memory" "none")
168 (eq_attr "type" "imovx")))
169 "decodern,(p0|p1)")  ;; any decoder, then one cycle on port 0 or port 1
170
171 (define_insn_reservation "ppro_imovx_load" 4  ;; extending move from memory, latency 4
172 (and (eq_attr "cpu" "pentiumpro")
173 (and (eq_attr "memory" "load")
174 (eq_attr "type" "imovx")))
175 "decodern,p2")  ;; any decoder, then the load port only
176
177 ;; lea executes on port 0 with latency one and throughput 1.
178 (define_insn_reservation "ppro_lea" 1  ;; latency 1
179 (and (eq_attr "cpu" "pentiumpro")
180 (and (eq_attr "memory" "none")
181 (eq_attr "type" "lea")))
182 "decodern,p0")  ;; any decoder, then one cycle on port 0
183
184 ;; Shift and rotate execute on port 0 with latency and throughput 1.
185 ;; The load and store units need to be reserved when memory operands
186 ;; are involved.
187 (define_insn_reservation "ppro_shift_rotate" 1  ;; register-only shift/rotate, latency 1
188 (and (eq_attr "cpu" "pentiumpro")
189 (and (eq_attr "memory" "none")
190 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
191 "decodern,p0")  ;; any decoder, then one cycle on port 0
192
193 (define_insn_reservation "ppro_shift_rotate_mem" 4  ;; shift/rotate with a memory operand, latency 4
194 (and (eq_attr "cpu" "pentiumpro")
195 (and (eq_attr "memory" "!none")  ;; any memory attribute value other than "none"
196 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
197 "decoder0,p2+p0,p4+p3")  ;; decoder0; load port together with port 0; then both store ports
198
199 (define_insn_reservation "ppro_cld" 2  ;; latency 2
200 (and (eq_attr "cpu" "pentiumpro")
201 (eq_attr "type" "cld"))
202 "decoder0,(p0+p1)*2")  ;; decoder0, then ports 0 and 1 together for two consecutive cycles
203
204 ;; The P6 has a sophisticated branch prediction mechanism to minimize
205 ;; latencies due to branching. In particular, it has a fast way to
206 ;; execute branches that are taken multiple times (such as in loops).
207 ;; Branches not taken suffer no penalty, and correctly predicted
208 ;; branches cost only one fetch cycle. Mispredicted branches are very
209 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
210 ;;
211 ;; Unfortunately all this makes it quite difficult to properly model
212 ;; the latencies for the compiler. Here I've made the choice to be
213 ;; optimistic and assume branches are often predicted correctly, so
214 ;; they have latency 1, and the decoders are not blocked.
215 ;;
216 ;; In addition, the model assumes a branch always decodes to only 1 uop,
217 ;; which is not exactly true because there are a few instructions that
218 ;; decode to 2 uops or microcode. But this probably gives the best
219 ;; results because we can assume these instructions can decode on all
220 ;; decoders.
221 (define_insn_reservation "ppro_branch" 1  ;; optimistic latency 1: assumes a correct prediction
222 (and (eq_attr "cpu" "pentiumpro")
223 (and (eq_attr "memory" "none")
224 (eq_attr "type" "ibr")))
225 "decodern,p1")  ;; any decoder, then port 1, which hosts the jump execution unit
226
227 ;; ??? Indirect branches probably have worse latency than this.
228 (define_insn_reservation "ppro_indirect_branch" 6  ;; branch with a memory operand, latency 6
229 (and (eq_attr "cpu" "pentiumpro")
230 (and (eq_attr "memory" "!none")  ;; any memory attribute value other than "none"
231 (eq_attr "type" "ibr")))
232 "decoder0,p2+p1")  ;; decoder0, then load port and port 1 in the same cycle
233
234 (define_insn_reservation "ppro_leave" 4  ;; latency 4
235 (and (eq_attr "cpu" "pentiumpro")
236 (eq_attr "type" "leave"))
237 "decoder0,p2+(p0|p1),(p0|p1)")  ;; decoder0; load port plus port 0 or 1; then port 0 or 1 again
238
239 ;; imul has throughput one, but latency 4, and can only execute on port 0.
240 (define_insn_reservation "ppro_imul" 4  ;; register-only multiply, latency 4
241 (and (eq_attr "cpu" "pentiumpro")
242 (and (eq_attr "memory" "none")
243 (eq_attr "type" "imul")))
244 "decodern,p0")  ;; port 0 is held for a single cycle, giving throughput one
245
246 (define_insn_reservation "ppro_imul_mem" 4  ;; multiply with a memory operand, latency 4
247 (and (eq_attr "cpu" "pentiumpro")
248 (and (eq_attr "memory" "!none")  ;; any memory attribute value other than "none"
249 (eq_attr "type" "imul")))
250 "decoder0,p2+p0")  ;; decoder0, then load port and port 0 in the same cycle
251
252 ;; div and idiv are very similar, so we model them the same.
253 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
254 ;; These issue latencies are modelled via the ppro_idiv automaton.
255 (define_insn_reservation "ppro_idiv_QI" 19  ;; 8-bit divide, register operands, latency 19
256 (and (eq_attr "cpu" "pentiumpro")
257 (and (eq_attr "memory" "none")
258 (and (eq_attr "mode" "QI")
259 (eq_attr "type" "idiv"))))
260 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")  ;; idiv unit is held for 2+1+9 = 12 cycles
261
262 (define_insn_reservation "ppro_idiv_QI_load" 19  ;; 8-bit divide with a memory operand, latency 19
263 (and (eq_attr "cpu" "pentiumpro")
264 (and (eq_attr "memory" "load")
265 (and (eq_attr "mode" "QI")
266 (eq_attr "type" "idiv"))))
267 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")  ;; load port in the first cycle; idiv held for 1+1+1+9 = 12 cycles
268
269 (define_insn_reservation "ppro_idiv_HI" 23  ;; 16-bit divide, register operands, latency 23
270 (and (eq_attr "cpu" "pentiumpro")
271 (and (eq_attr "memory" "none")
272 (and (eq_attr "mode" "HI")
273 (eq_attr "type" "idiv"))))
274 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")  ;; idiv unit is held for 3+1+17 = 21 cycles
275
276 (define_insn_reservation "ppro_idiv_HI_load" 23  ;; 16-bit divide with a memory operand, latency 23
277 (and (eq_attr "cpu" "pentiumpro")
278 (and (eq_attr "memory" "load")
279 (and (eq_attr "mode" "HI")
280 (eq_attr "type" "idiv"))))
281 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")  ;; load port in the first cycle; idiv held for 1+1+1+18 = 21 cycles
282
283 (define_insn_reservation "ppro_idiv_SI" 39  ;; 32-bit divide, register operands, latency 39
284 (and (eq_attr "cpu" "pentiumpro")
285 (and (eq_attr "memory" "none")
286 (and (eq_attr "mode" "SI")
287 (eq_attr "type" "idiv"))))
288 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")  ;; idiv unit is held for 3+1+33 = 37 cycles
289
290 (define_insn_reservation "ppro_idiv_SI_load" 39  ;; 32-bit divide with a memory operand, latency 39
291 (and (eq_attr "cpu" "pentiumpro")
292 (and (eq_attr "memory" "load")
293 (and (eq_attr "mode" "SI")
294 (eq_attr "type" "idiv"))))
295 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")  ;; load port in the first cycle; idiv held for 1+1+1+34 = 37 cycles
296
297 ;; Floating point operations always execute on port 0.
298 ;; ??? where do these latencies come from? fadd has latency 3 and
299 ;; has throughput "1/cycle (align with FADD)". What do they
300 ;; mean and how can we model that?
301 (define_insn_reservation "ppro_fop" 3  ;; x87 arithmetic without a load or store, latency 3
302 (and (eq_attr "cpu" "pentiumpro")
303 (and (eq_attr "memory" "none,unknown")  ;; matches either "none" or "unknown"
304 (eq_attr "type" "fop")))
305 "decodern,p0")  ;; any decoder, then one cycle on port 0
306
307 (define_insn_reservation "ppro_fop_load" 5  ;; x87 arithmetic with a memory source, latency 5
308 (and (eq_attr "cpu" "pentiumpro")
309 (and (eq_attr "memory" "load")
310 (eq_attr "type" "fop")))
311 "decoder0,p2+p0,p0")  ;; decoder0; load port together with port 0; then port 0 alone
312
313 (define_insn_reservation "ppro_fop_store" 3  ;; x87 arithmetic with a memory destination, latency 3
314 (and (eq_attr "cpu" "pentiumpro")
315 (and (eq_attr "memory" "store")
316 (eq_attr "type" "fop")))
317 "decoder0,p0,p0,p0+p4+p3")  ;; decoder0; port 0 for three cycles, the last shared with both store ports
318
319 (define_insn_reservation "ppro_fop_both" 5  ;; x87 arithmetic that both loads and stores, latency 5
320 (and (eq_attr "cpu" "pentiumpro")
321 (and (eq_attr "memory" "both")
322 (eq_attr "type" "fop")))
323 "decoder0,p2+p0,p0+p4+p3")  ;; decoder0; load port with port 0; then port 0 with both store ports
324
325 (define_insn_reservation "ppro_fsgn" 1  ;; latency 1; no memory attribute is tested
326 (and (eq_attr "cpu" "pentiumpro")
327 (eq_attr "type" "fsgn"))
328 "decodern,p0")  ;; any decoder, then one cycle on port 0
329
330 (define_insn_reservation "ppro_fistp" 5  ;; latency 5
331 (and (eq_attr "cpu" "pentiumpro")
332 (eq_attr "type" "fistp"))
333 "decoder0,p0*2,p4+p3")  ;; decoder0; port 0 for two cycles; then both store ports
334
335 (define_insn_reservation "ppro_fcmov" 2  ;; latency 2
336 (and (eq_attr "cpu" "pentiumpro")
337 (eq_attr "type" "fcmov"))
338 "decoder0,p0*2")  ;; decoder0, then port 0 for two consecutive cycles
339
340 (define_insn_reservation "ppro_fcmp" 1  ;; x87 compare without a memory operand, latency 1
341 (and (eq_attr "cpu" "pentiumpro")
342 (and (eq_attr "memory" "none")
343 (eq_attr "type" "fcmp")))
344 "decodern,p0")  ;; any decoder, then one cycle on port 0
345
346 (define_insn_reservation "ppro_fcmp_load" 4  ;; x87 compare against memory, latency 4
347 (and (eq_attr "cpu" "pentiumpro")
348 (and (eq_attr "memory" "load")
349 (eq_attr "type" "fcmp")))
350 "decoder0,p2+p0")  ;; decoder0, then load port and port 0 in the same cycle
351
352 (define_insn_reservation "ppro_fmov" 1  ;; x87 move without a memory operand, latency 1
353 (and (eq_attr "cpu" "pentiumpro")
354 (and (eq_attr "memory" "none")
355 (eq_attr "type" "fmov")))
356 "decodern,p0")  ;; any decoder, then one cycle on port 0
357
358 (define_insn_reservation "ppro_fmov_load" 1  ;; x87 load in any mode except XF, latency 1
359 (and (eq_attr "cpu" "pentiumpro")
360 (and (eq_attr "memory" "load")
361 (and (eq_attr "mode" "!XF")  ;; XFmode loads are handled by ppro_fmov_XF_load
362 (eq_attr "type" "fmov"))))
363 "decodern,p2")  ;; any decoder, then the load port only
364
365 (define_insn_reservation "ppro_fmov_XF_load" 3  ;; XFmode x87 load, latency 3
366 (and (eq_attr "cpu" "pentiumpro")
367 (and (eq_attr "memory" "load")
368 (and (eq_attr "mode" "XF")
369 (eq_attr "type" "fmov"))))
370 "decoder0,(p2+p0)*2")  ;; decoder0, then load port and port 0 together for two cycles
371
372 (define_insn_reservation "ppro_fmov_store" 1  ;; x87 store in any mode except XF, latency 1
373 (and (eq_attr "cpu" "pentiumpro")
374 (and (eq_attr "memory" "store")
375 (and (eq_attr "mode" "!XF")  ;; XFmode stores are handled by ppro_fmov_XF_store
376 (eq_attr "type" "fmov"))))
377 "decodern,p0")  ;; any decoder, then port 0; the store ports are not reserved here
378
379 (define_insn_reservation "ppro_fmov_XF_store" 3  ;; XFmode x87 store, latency 3
380 (and (eq_attr "cpu" "pentiumpro")
381 (and (eq_attr "memory" "store")
382 (and (eq_attr "mode" "XF")
383 (eq_attr "type" "fmov"))))
384 "decoder0,(p0+p4),(p0+p3)")  ;; decoder0; port 0 with port 4; then port 0 with port 3
385
386 ;; fmul executes on port 0 with latency 5. It has issue latency 2,
387 ;; but we don't model this.
388 (define_insn_reservation "ppro_fmul" 5  ;; x87 multiply without a memory operand, latency 5
389 (and (eq_attr "cpu" "pentiumpro")
390 (and (eq_attr "memory" "none")
391 (eq_attr "type" "fmul")))
392 "decoder0,p0*2")  ;; decoder0, then port 0 for two consecutive cycles
393
394 (define_insn_reservation "ppro_fmul_load" 6  ;; x87 multiply with a memory source, latency 6
395 (and (eq_attr "cpu" "pentiumpro")
396 (and (eq_attr "memory" "load")
397 (eq_attr "type" "fmul")))
398 "decoder0,p2+p0,p0")  ;; decoder0; load port together with port 0; then port 0 alone
399
400 ;; fdiv latencies depend on the mode of the operands. XFmode gives
401 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
402 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
403 ;; that. Throughput is equal to latency - 1, which we model using the
404 ;; ppro_fdiv automaton.
405 (define_insn_reservation "ppro_fdiv_SF" 18  ;; SFmode divide (types fdiv and fpspc), latency 18
406 (and (eq_attr "cpu" "pentiumpro")
407 (and (eq_attr "memory" "none")
408 (and (eq_attr "mode" "SF")
409 (eq_attr "type" "fdiv,fpspc"))))
410 "decodern,p0+fdiv,fdiv*16")  ;; fdiv unit is held for 1+16 = 17 cycles, i.e. latency - 1
411
412 (define_insn_reservation "ppro_fdiv_SF_load" 19  ;; SFmode divide with a memory source, latency 19
413 (and (eq_attr "cpu" "pentiumpro")
414 (and (eq_attr "memory" "load")
415 (and (eq_attr "mode" "SF")
416 (eq_attr "type" "fdiv,fpspc"))))
417 "decoder0,p2+p0+fdiv,fdiv*16")  ;; load port in the first cycle; fdiv held for 17 cycles as in the register form
418
419 (define_insn_reservation "ppro_fdiv_DF" 32  ;; DFmode divide (types fdiv and fpspc), latency 32
420 (and (eq_attr "cpu" "pentiumpro")
421 (and (eq_attr "memory" "none")
422 (and (eq_attr "mode" "DF")
423 (eq_attr "type" "fdiv,fpspc"))))
424 "decodern,p0+fdiv,fdiv*30")  ;; fdiv unit is held for 1+30 = 31 cycles, i.e. latency - 1
425
426 (define_insn_reservation "ppro_fdiv_DF_load" 33  ;; DFmode divide with a memory source, latency 33
427 (and (eq_attr "cpu" "pentiumpro")
428 (and (eq_attr "memory" "load")
429 (and (eq_attr "mode" "DF")
430 (eq_attr "type" "fdiv,fpspc"))))
431 "decoder0,p2+p0+fdiv,fdiv*30")  ;; load port in the first cycle; fdiv held for 31 cycles as in the register form
432
433 (define_insn_reservation "ppro_fdiv_XF" 38  ;; XFmode divide (types fdiv and fpspc), latency 38
434 (and (eq_attr "cpu" "pentiumpro")
435 (and (eq_attr "memory" "none")
436 (and (eq_attr "mode" "XF")
437 (eq_attr "type" "fdiv,fpspc"))))
438 "decodern,p0+fdiv,fdiv*36")  ;; fdiv unit is held for 1+36 = 37 cycles, i.e. latency - 1
439
440 (define_insn_reservation "ppro_fdiv_XF_load" 39  ;; XFmode divide with a memory source, latency 39
441 (and (eq_attr "cpu" "pentiumpro")
442 (and (eq_attr "memory" "load")
443 (and (eq_attr "mode" "XF")
444 (eq_attr "type" "fdiv,fpspc"))))
445 "decoder0,p2+p0+fdiv,fdiv*36")  ;; load port in the first cycle; fdiv held for 37 cycles as in the register form
446
447 ;; MMX instructions can execute on either port 0 or port 1 with a
448 ;; throughput of 1/cycle.
449 ;; on port 0: - ALU (latency 1)
450 ;; - Multiplier Unit (latency 3)
451 ;; on port 1: - ALU (latency 1)
452 ;; - Shift Unit (latency 1)
453 ;;
454 ;; MMX instructions are either of the type reg-reg, or read-modify, and
455 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
456 ;; so they behave as "simple" instructions that need no special modelling.
457 ;; We only have to model mmxshft and mmxmul.
458 (define_insn_reservation "ppro_mmx_shft" 1  ;; MMX shift without a memory operand, latency 1
459 (and (eq_attr "cpu" "pentiumpro")
460 (and (eq_attr "memory" "none")
461 (eq_attr "type" "mmxshft")))
462 "decodern,p1")  ;; any decoder, then port 1, where the MMX shift unit sits
463
464 (define_insn_reservation "ppro_mmx_shft_load" 2  ;; MMX shift with a memory source, latency 2
465 (and (eq_attr "cpu" "pentiumpro")
466 (and (eq_attr "memory" "load")  ;; must be "load": "none" is already claimed by ppro_mmx_shft
467 (eq_attr "type" "mmxshft")))
468 "decoder0,p2+p1")  ;; decoder0, then load port and port 1 in the same cycle
469
470 (define_insn_reservation "ppro_mmx_mul" 3  ;; MMX multiply without a memory operand, latency 3
471 (and (eq_attr "cpu" "pentiumpro")
472 (and (eq_attr "memory" "none")
473 (eq_attr "type" "mmxmul")))
474 "decodern,p0")  ;; any decoder, then port 0, where the MMX multiplier sits
475
476 (define_insn_reservation "ppro_mmx_mul_load" 3  ;; MMX multiply with a memory source, latency 3
477 (and (eq_attr "cpu" "pentiumpro")
478 (and (eq_attr "memory" "load")  ;; must be "load": "none" is already claimed by ppro_mmx_mul
479 (eq_attr "type" "mmxmul")))
480 "decoder0,p2+p0")  ;; decoder0, then load port and port 0 in the same cycle
481
482 (define_insn_reservation "ppro_sse_mmxcvt" 4  ;; DImode mmxcvt, latency 4; no memory attribute is tested
483 (and (eq_attr "cpu" "pentiumpro")
484 (and (eq_attr "mode" "DI")
485 (eq_attr "type" "mmxcvt")))
486 "decodern,p1")  ;; any decoder, then one cycle on port 1
487
488 ;; FIXME: These are Pentium III only, but we cannot tell here if
489 ;; we're generating code for PentiumPro/Pentium II or Pentium III
490 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
491 ;; (and (eq_attr "cpu" "pentiumpro")
492 ;; (and (eq_attr "mode" "DI")
493 ;; (eq_attr "type" "mmxshft")))
494 ;; "decodern,p0")
495
496 ;; SSE is very complicated, and takes a bit more effort.
497 ;; ??? I assumed that all SSE instructions decode on decoder0,
498 ;; but is this correct?
499
500 ;; The sfence instruction.
501 (define_insn_reservation "ppro_sse_sfence" 3  ;; latency 3
502 (and (eq_attr "cpu" "pentiumpro")
503 (and (eq_attr "memory" "unknown")  ;; selected as the type "sse" insn whose memory attribute is "unknown"
504 (eq_attr "type" "sse")))
505 "decoder0,p4+p3")  ;; decoder0, then both store ports in the same cycle
506
507 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
508 (define_insn_reservation "ppro_sse_SF" 3  ;; generic SFmode "sse" insn, latency 3; no memory attribute is tested
509 (and (eq_attr "cpu" "pentiumpro")
510 (and (eq_attr "mode" "SF")
511 (eq_attr "type" "sse")))
512 "decodern,p0")  ;; any decoder, then one cycle on port 0
513
514 (define_insn_reservation "ppro_sse_add_SF" 3  ;; scalar SSE add without a memory operand, latency 3
515 (and (eq_attr "cpu" "pentiumpro")
516 (and (eq_attr "memory" "none")
517 (and (eq_attr "mode" "SF")
518 (eq_attr "type" "sseadd"))))
519 "decodern,p1")  ;; any decoder, then one cycle on port 1
520
521 (define_insn_reservation "ppro_sse_add_SF_load" 3  ;; scalar SSE add with a memory source, latency 3
522 (and (eq_attr "cpu" "pentiumpro")
523 (and (eq_attr "memory" "load")
524 (and (eq_attr "mode" "SF")
525 (eq_attr "type" "sseadd"))))
526 "decoder0,p2+p1")  ;; decoder0, then load port and port 1 in the same cycle
527
528 (define_insn_reservation "ppro_sse_cmp_SF" 3  ;; scalar SSE compare without a memory operand, latency 3
529 (and (eq_attr "cpu" "pentiumpro")
530 (and (eq_attr "memory" "none")
531 (and (eq_attr "mode" "SF")
532 (eq_attr "type" "ssecmp"))))
533 "decoder0,p1")  ;; decoder0 only, then one cycle on port 1
534
535 (define_insn_reservation "ppro_sse_cmp_SF_load" 3  ;; scalar SSE compare with a memory source, latency 3
536 (and (eq_attr "cpu" "pentiumpro")
537 (and (eq_attr "memory" "load")
538 (and (eq_attr "mode" "SF")
539 (eq_attr "type" "ssecmp"))))
540 "decoder0,p2+p1")  ;; decoder0, then load port and port 1 in the same cycle
541
542 (define_insn_reservation "ppro_sse_comi_SF" 1  ;; scalar ssecomi without a memory operand, latency 1
543 (and (eq_attr "cpu" "pentiumpro")
544 (and (eq_attr "memory" "none")
545 (and (eq_attr "mode" "SF")
546 (eq_attr "type" "ssecomi"))))
547 "decodern,p0")  ;; any decoder, then one cycle on port 0
548
549 (define_insn_reservation "ppro_sse_comi_SF_load" 1  ;; scalar ssecomi with a memory source, latency 1
550 (and (eq_attr "cpu" "pentiumpro")
551 (and (eq_attr "memory" "load")
552 (and (eq_attr "mode" "SF")
553 (eq_attr "type" "ssecomi"))))
554 "decoder0,p2+p0")  ;; decoder0, then load port and port 0 in the same cycle
555
556 (define_insn_reservation "ppro_sse_mul_SF" 4  ;; scalar SSE multiply without a memory operand, latency 4
557 (and (eq_attr "cpu" "pentiumpro")
558 (and (eq_attr "memory" "none")
559 (and (eq_attr "mode" "SF")
560 (eq_attr "type" "ssemul"))))
561 "decodern,p0")  ;; any decoder, then one cycle on port 0
562
563 (define_insn_reservation "ppro_sse_mul_SF_load" 4  ;; scalar SSE multiply with a memory source, latency 4
564 (and (eq_attr "cpu" "pentiumpro")
565 (and (eq_attr "memory" "load")
566 (and (eq_attr "mode" "SF")
567 (eq_attr "type" "ssemul"))))
568 "decoder0,p2+p0")  ;; decoder0, then load port and port 0 in the same cycle
569
570 ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
571 (define_insn_reservation "ppro_sse_div_SF" 18  ;; scalar SSE divide without a memory operand, latency 18
572 (and (eq_attr "cpu" "pentiumpro")
573 (and (eq_attr "memory" "none")
574 (and (eq_attr "mode" "SF")
575 (eq_attr "type" "ssediv"))))
576 "decoder0,p0*17")  ;; decoder0, then port 0 itself is held for 17 cycles (no separate divider unit is used)
577
578 (define_insn_reservation "ppro_sse_div_SF_load" 18  ;; scalar SSE divide with a memory source, latency 18
579 (and (eq_attr "cpu" "pentiumpro")
580 (and (eq_attr "memory" "load")  ;; must be "load": "none" is already claimed by ppro_sse_div_SF
581 (and (eq_attr "mode" "SF")
582 (eq_attr "type" "ssediv"))))
583 "decoder0,(p2+p0),p0*16")  ;; decoder0; load port with port 0; then port 0 for 16 more cycles (17 in total)
584
585 (define_insn_reservation "ppro_sse_icvt_SF" 4  ;; SFmode sseicvt, latency 4; no memory attribute is tested
586 (and (eq_attr "cpu" "pentiumpro")
587 (and (eq_attr "mode" "SF")
588 (eq_attr "type" "sseicvt")))
589 "decoder0,(p2+p1)*2")  ;; decoder0, then load port and port 1 together for two cycles
590
591 (define_insn_reservation "ppro_sse_icvt_SI" 3  ;; SImode sseicvt, latency 3; no memory attribute is tested
592 (and (eq_attr "cpu" "pentiumpro")
593 (and (eq_attr "mode" "SI")
594 (eq_attr "type" "sseicvt")))
595 "decoder0,(p2+p1)")  ;; decoder0, then load port and port 1 in the same cycle
596
597 (define_insn_reservation "ppro_sse_mov_SF" 3  ;; scalar SSE move without a memory operand, latency 3
598 (and (eq_attr "cpu" "pentiumpro")
599 (and (eq_attr "memory" "none")
600 (and (eq_attr "mode" "SF")
601 (eq_attr "type" "ssemov"))))
602 "decoder0,(p0|p1)")  ;; decoder0, then one cycle on port 0 or port 1
603
604 (define_insn_reservation "ppro_sse_mov_SF_load" 3  ;; scalar SSE move from memory, latency 3
605 (and (eq_attr "cpu" "pentiumpro")
606 (and (eq_attr "memory" "load")
607 (and (eq_attr "mode" "SF")
608 (eq_attr "type" "ssemov"))))
609 "decoder0,p2+(p0|p1)")  ;; decoder0, then load port together with port 0 or port 1
610
611 (define_insn_reservation "ppro_sse_mov_SF_store" 3  ;; scalar SSE move to memory, latency 3
612 (and (eq_attr "cpu" "pentiumpro")
613 (and (eq_attr "memory" "store")
614 (and (eq_attr "mode" "SF")
615 (eq_attr "type" "ssemov"))))
616 "decoder0,p4+p3")  ;; decoder0, then both store ports in the same cycle
617
618 (define_insn_reservation "ppro_sse_V4SF" 4  ;; generic V4SFmode "sse" insn, latency 4; no memory attribute is tested
619 (and (eq_attr "cpu" "pentiumpro")
620 (and (eq_attr "mode" "V4SF")
621 (eq_attr "type" "sse")))
622 "decoder0,p1*2")  ;; decoder0, then port 1 for two consecutive cycles
623
624 (define_insn_reservation "ppro_sse_add_V4SF" 3  ;; packed SSE add without a memory operand, latency 3
625 (and (eq_attr "cpu" "pentiumpro")
626 (and (eq_attr "memory" "none")
627 (and (eq_attr "mode" "V4SF")
628 (eq_attr "type" "sseadd"))))
629 "decoder0,p1*2")  ;; decoder0, then port 1 for two consecutive cycles
630
631 (define_insn_reservation "ppro_sse_add_V4SF_load" 3  ;; packed SSE add with a memory source, latency 3
632 (and (eq_attr "cpu" "pentiumpro")
633 (and (eq_attr "memory" "load")
634 (and (eq_attr "mode" "V4SF")
635 (eq_attr "type" "sseadd"))))
636 "decoder0,(p2+p1)*2")  ;; decoder0, then load port and port 1 together for two cycles
637
638 (define_insn_reservation "ppro_sse_cmp_V4SF" 3  ;; packed SSE compare without a memory operand, latency 3
639 (and (eq_attr "cpu" "pentiumpro")
640 (and (eq_attr "memory" "none")
641 (and (eq_attr "mode" "V4SF")
642 (eq_attr "type" "ssecmp"))))
643 "decoder0,p1*2")  ;; decoder0, then port 1 for two consecutive cycles
644
645 (define_insn_reservation "ppro_sse_cmp_V4SF_load" 3  ;; packed SSE compare with a memory source, latency 3
646 (and (eq_attr "cpu" "pentiumpro")
647 (and (eq_attr "memory" "load")
648 (and (eq_attr "mode" "V4SF")
649 (eq_attr "type" "ssecmp"))))
650 "decoder0,(p2+p1)*2")  ;; decoder0, then load port and port 1 together for two cycles
651
652 (define_insn_reservation "ppro_sse_cvt_V4SF" 3  ;; packed ssecvt without a load or store, latency 3
653 (and (eq_attr "cpu" "pentiumpro")
654 (and (eq_attr "memory" "none,unknown")  ;; matches either "none" or "unknown"
655 (and (eq_attr "mode" "V4SF")
656 (eq_attr "type" "ssecvt"))))
657 "decoder0,p1*2")  ;; decoder0, then port 1 for two consecutive cycles
658
659 (define_insn_reservation "ppro_sse_cvt_V4SF_other" 4  ;; packed ssecvt forms not covered by ppro_sse_cvt_V4SF, latency 4
660 (and (eq_attr "cpu" "pentiumpro")
661 (and (eq_attr "memory" "!none,unknown")  ;; memory attribute is neither "none" nor "unknown"
662 (and (eq_attr "mode" "V4SF")
663 (eq_attr "type" "ssecvt"))))  ;; was "ssecmp", which left memory forms of ssecvt without a reservation
664 "decoder0,p1,p4+p3")  ;; decoder0; port 1; then both store ports
665
666 (define_insn_reservation "ppro_sse_mul_V4SF" 5  ;; packed SSE multiply without a memory operand, latency 5
667 (and (eq_attr "cpu" "pentiumpro")
668 (and (eq_attr "memory" "none")
669 (and (eq_attr "mode" "V4SF")
670 (eq_attr "type" "ssemul"))))
671 "decoder0,p0*2")  ;; decoder0, then port 0 for two consecutive cycles
672
673 (define_insn_reservation "ppro_sse_mul_V4SF_load" 5  ;; packed SSE multiply with a memory source, latency 5
674 (and (eq_attr "cpu" "pentiumpro")
675 (and (eq_attr "memory" "load")
676 (and (eq_attr "mode" "V4SF")
677 (eq_attr "type" "ssemul"))))
678 "decoder0,(p2+p0)*2")  ;; decoder0, then load port and port 0 together for two cycles
679
680 ;; FIXME: p0 really closed this long???
681 (define_insn_reservation "ppro_sse_div_V4SF" 48  ;; packed SSE divide without a memory operand, latency 48
682 (and (eq_attr "cpu" "pentiumpro")
683 (and (eq_attr "memory" "none")
684 (and (eq_attr "mode" "V4SF")
685 (eq_attr "type" "ssediv"))))
686 "decoder0,p0*34")  ;; decoder0, then port 0 itself is held for 34 cycles
687
688 (define_insn_reservation "ppro_sse_div_V4SF_load" 48  ;; packed SSE divide with a memory source, latency 48
689 (and (eq_attr "cpu" "pentiumpro")
690 (and (eq_attr "memory" "load")  ;; must be "load": "none" is already claimed by ppro_sse_div_V4SF
691 (and (eq_attr "mode" "V4SF")
692 (eq_attr "type" "ssediv"))))
693 "decoder0,(p2+p0)*2,p0*32")  ;; decoder0; load port with port 0 for two cycles; then port 0 for 32 more (34 in total)
694
695 (define_insn_reservation "ppro_sse_log_V4SF" 2  ;; packed sselog without a memory operand, latency 2
696 (and (eq_attr "cpu" "pentiumpro")
697 (and (eq_attr "memory" "none")
698 (and (eq_attr "mode" "V4SF")
699 (eq_attr "type" "sselog"))))
700 "decodern,p1")  ;; any decoder, then one cycle on port 1
701
702 (define_insn_reservation "ppro_sse_log_V4SF_load" 2  ;; packed sselog with a memory source, latency 2
703 (and (eq_attr "cpu" "pentiumpro")
704 (and (eq_attr "memory" "load")  ;; must be "load": "none" is already claimed by ppro_sse_log_V4SF
705 (and (eq_attr "mode" "V4SF")
706 (eq_attr "type" "sselog"))))
707 "decoder0,(p2+p1)")  ;; decoder0, then load port and port 1 in the same cycle
708
709 (define_insn_reservation "ppro_sse_mov_V4SF" 1  ;; packed SSE move without a memory operand, latency 1
710 (and (eq_attr "cpu" "pentiumpro")
711 (and (eq_attr "memory" "none")
712 (and (eq_attr "mode" "V4SF")
713 (eq_attr "type" "ssemov"))))
714 "decoder0,(p0|p1)*2")  ;; decoder0, then two cycles, each on port 0 or port 1
715
716 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2  ;; packed SSE move from memory, latency 2
717 (and (eq_attr "cpu" "pentiumpro")
718 (and (eq_attr "memory" "load")
719 (and (eq_attr "mode" "V4SF")
720 (eq_attr "type" "ssemov"))))
721 "decoder0,p2*2")  ;; decoder0, then the load port for two consecutive cycles
722
723 (define_insn_reservation "ppro_sse_mov_V4SF_store" 3  ;; packed SSE move to memory, latency 3
724 (and (eq_attr "cpu" "pentiumpro")
725 (and (eq_attr "memory" "store")
726 (and (eq_attr "mode" "V4SF")
727 (eq_attr "type" "ssemov"))))
728 "decoder0,(p4+p3)*2")  ;; decoder0, then both store ports together for two cycles
729
730 ;; All other instructions are modelled as simple instructions.
731 ;; We have already modelled all i387 floating point instructions, so all
732 ;; other instructions execute on either port 0 or port 1. This includes
733 ;; the ALU units, and the MMX units.
734 ;;
735 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
736 ;; the three decoders.
737 (define_insn_reservation "ppro_insn" 1  ;; catch-all for simple insns without a load or store, latency 1
738 (and (eq_attr "cpu" "pentiumpro")
739 (and (eq_attr "memory" "none,unknown")  ;; matches either "none" or "unknown"
740 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
741 "decodern,(p0|p1)")  ;; any decoder, then one cycle on port 0 or port 1
742
743 ;; read-modify and register-memory instructions have 2 or three uops,
744 ;; so they have to be decoded on decoder0.
745 (define_insn_reservation "ppro_insn_load" 3  ;; simple insn with a memory source, latency 3
746 (and (eq_attr "cpu" "pentiumpro")
747 (and (eq_attr "memory" "load")
748 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
749 "decoder0,p2+(p0|p1)")  ;; decoder0, then load port together with port 0 or port 1
750
751 (define_insn_reservation "ppro_insn_store" 1  ;; simple insn with a memory destination, latency 1
752 (and (eq_attr "cpu" "pentiumpro")
753 (and (eq_attr "memory" "store")
754 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
755 "decoder0,(p0|p1),p4+p3")  ;; decoder0; port 0 or port 1; then both store ports
756
757 ;; read-modify-store instructions produce 4 uops so they have to be
758 ;; decoded on decoder0 as well.
759 (define_insn_reservation "ppro_insn_both" 4  ;; simple read-modify-write insn, latency 4
760 (and (eq_attr "cpu" "pentiumpro")
761 (and (eq_attr "memory" "both")
762 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
763 "decoder0,p2+(p0|p1),p4+p3")  ;; decoder0; load port with port 0 or port 1; then both store ports
764