x86-tune-costs.h source code [gcc/config/i386/x86-tune-costs.h]

/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2023 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */
/* Express a cost as an instruction size of N bytes.
   We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
   so that a 2-byte add comes out as COSTS_N_BYTES (2) == 4.  */
#define COSTS_N_BYTES(N) ((N) * 2)
27
/* Placeholder stringop_algs entry: libcall for unknown sizes and a single
   {-1, libcall, false} size entry.  Used below as the second element of
   the per-CPU memcpy/memset tables that define only their first entry.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29
30	static stringop_algs ix86_size_memcpy[`2`] = {
31	{.unknown_size: rep_prefix_1_byte, .size: {{-`1`, rep_prefix_1_byte, false}}},
32	{.unknown_size: rep_prefix_1_byte, .size: {{-`1`, rep_prefix_1_byte, false}}}};
33	static stringop_algs ix86_size_memset[`2`] = {
34	{.unknown_size: rep_prefix_1_byte, .size: {{-`1`, rep_prefix_1_byte, false}}},
35	{.unknown_size: rep_prefix_1_byte, .size: {{-`1`, rep_prefix_1_byte, false}}}};
36
37	const
38	struct processor_costs ix86_size_cost = {/ costs for tuning for size /
39	.hard_register: {
40	/ Start of register allocator costs. integer->integer move cost is 2. /
41	.movzbl_load: `2`, / cost for loading QImode using movzbl /
42	.int_load: {`2`, `2`, `2`}, / cost of loading integer registers*
43	in QImode, HImode and SImode.
44	Relative to reg-reg move (2). /*
45	.int_store: {`2`, `2`, `2`}, / cost of storing integer registers /
46	.fp_move: `2`, / cost of reg,reg fld/fst /
47	.fp_load: {`2`, `2`, `2`}, / cost of loading fp registers*
48	in SFmode, DFmode and XFmode /*
49	.fp_store: {`2`, `2`, `2`}, / cost of storing fp registers*
50	in SFmode, DFmode and XFmode /*
51	.mmx_move: `3`, / cost of moving MMX register /
52	.mmx_load: {`3`, `3`}, / cost of loading MMX registers*
53	in SImode and DImode /*
54	.mmx_store: {`3`, `3`}, / cost of storing MMX registers*
55	in SImode and DImode /*
56	.xmm_move: `3`, .ymm_move: `3`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM register /
57	.sse_load: {`3`, `3`, `3`, `3`, `3`}, / cost of loading SSE registers*
58	in 32,64,128,256 and 512-bit /*
59	.sse_store: {`3`, `3`, `3`, `3`, `3`}, / cost of storing SSE registers*
60	in 32,64,128,256 and 512-bit /*
61	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
62	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
63	.mask_load: {`2`, `2`, `2`}, / cost of loading mask register*
64	in QImode, HImode, SImode. /*
65	.mask_store: {`2`, `2`, `2`}, / cost if storing mask register*
66	in QImode, HImode, SImode. /*
67	.mask_move: `2`, / cost of moving mask register. /
68	/ End of register allocator costs. /
69	},
70
71	COSTS_N_BYTES (`2`), / cost of an add instruction /
72	COSTS_N_BYTES (`3`), / cost of a lea instruction /
73	COSTS_N_BYTES (`2`), / variable shift costs /
74	COSTS_N_BYTES (`3`), / constant shift costs /
75	.mult_init: {COSTS_N_BYTES (`3`), / cost of starting multiply for QI /
76	COSTS_N_BYTES (`3`), / HI /
77	COSTS_N_BYTES (`3`), / SI /
78	COSTS_N_BYTES (`3`), / DI /
79	COSTS_N_BYTES (`5`)}, / other /
80	.mult_bit: `0`, / cost of multiply per each bit set /
81	.divide: {COSTS_N_BYTES (`3`), / cost of a divide/mod for QI /
82	COSTS_N_BYTES (`3`), / HI /
83	COSTS_N_BYTES (`3`), / SI /
84	COSTS_N_BYTES (`3`), / DI /
85	COSTS_N_BYTES (`5`)}, / other /
86	COSTS_N_BYTES (`3`), / cost of movsx /
87	COSTS_N_BYTES (`3`), / cost of movzx /
88	.large_insn: `0`, / "large" insn /
89	.move_ratio: `2`, / MOVE_RATIO /
90	.clear_ratio: `2`, / CLEAR_RATIO /
91	.int_load: {`2`, `2`, `2`}, / cost of loading integer registers*
92	in QImode, HImode and SImode.
93	Relative to reg-reg move (2). /*
94	.int_store: {`2`, `2`, `2`}, / cost of storing integer registers /
95	.sse_load: {`3`, `3`, `3`, `3`, `3`}, / cost of loading SSE register*
96	in 32bit, 64bit, 128bit, 256bit and 512bit /*
97	.sse_store: {`3`, `3`, `3`, `3`, `3`}, / cost of storing SSE register*
98	in 32bit, 64bit, 128bit, 256bit and 512bit /*
99	.sse_unaligned_load: {`3`, `3`, `3`, `3`, `3`}, / cost of unaligned SSE load*
100	in 128bit, 256bit and 512bit /*
101	.sse_unaligned_store: {`3`, `3`, `3`, `3`, `3`}, / cost of unaligned SSE store*
102	in 128bit, 256bit and 512bit /*
103	.xmm_move: `3`, .ymm_move: `3`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM register /
104	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
105	.gather_static: `5`, .gather_per_elt: `0`, / Gather load static, per_elt. /
106	.scatter_static: `5`, .scatter_per_elt: `0`, / Gather store static, per_elt. /
107	.l1_cache_size: `0`, / size of l1 cache /
108	.l2_cache_size: `0`, / size of l2 cache /
109	.prefetch_block: `0`, / size of prefetch block /
110	.simultaneous_prefetches: `0`, / number of parallel prefetches /
111	.branch_cost: `2`, / Branch cost /
112	COSTS_N_BYTES (`2`), / cost of FADD and FSUB insns. /
113	COSTS_N_BYTES (`2`), / cost of FMUL instruction. /
114	COSTS_N_BYTES (`2`), / cost of FDIV instruction. /
115	COSTS_N_BYTES (`2`), / cost of FABS instruction. /
116	COSTS_N_BYTES (`2`), / cost of FCHS instruction. /
117	COSTS_N_BYTES (`2`), / cost of FSQRT instruction. /
118
119	COSTS_N_BYTES (`2`), / cost of cheap SSE instruction. /
120	COSTS_N_BYTES (`2`), / cost of ADDSS/SD SUBSS/SD insns. /
121	COSTS_N_BYTES (`2`), / cost of MULSS instruction. /
122	COSTS_N_BYTES (`2`), / cost of MULSD instruction. /
123	COSTS_N_BYTES (`2`), / cost of FMA SS instruction. /
124	COSTS_N_BYTES (`2`), / cost of FMA SD instruction. /
125	COSTS_N_BYTES (`2`), / cost of DIVSS instruction. /
126	COSTS_N_BYTES (`2`), / cost of DIVSD instruction. /
127	COSTS_N_BYTES (`2`), / cost of SQRTSS instruction. /
128	COSTS_N_BYTES (`2`), / cost of SQRTSD instruction. /
129	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
130	.memcpy: ix86_size_memcpy,
131	.memset: ix86_size_memset,
132	COSTS_N_BYTES (`1`), / cond_taken_branch_cost. /
133	COSTS_N_BYTES (`1`), / cond_not_taken_branch_cost. /
134	NULL, / Loop alignment. /
135	NULL, / Jump alignment. /
136	NULL, / Label alignment. /
137	NULL, / Func alignment. /
138	.small_unroll_ninsns: `4`, / Small unroll limit. /
139	.small_unroll_factor: `2`, / Small unroll factor. /
140	};
141
/* Processor costs (relative to an add) */
143	static stringop_algs i386_memcpy[`2`] = {
144	{.unknown_size: rep_prefix_1_byte, .size: {{-`1`, rep_prefix_1_byte, false}}},
145	DUMMY_STRINGOP_ALGS};
146	static stringop_algs i386_memset[`2`] = {
147	{.unknown_size: rep_prefix_1_byte, .size: {{-`1`, rep_prefix_1_byte, false}}},
148	DUMMY_STRINGOP_ALGS};
149
150	static const
151	struct processor_costs i386_cost = { / 386 specific costs /
152	.hard_register: {
153	/ Start of register allocator costs. integer->integer move cost is 2. /
154	.movzbl_load: `4`, / cost for loading QImode using movzbl /
155	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
156	in QImode, HImode and SImode.
157	Relative to reg-reg move (2). /*
158	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
159	.fp_move: `2`, / cost of reg,reg fld/fst /
160	.fp_load: {`8`, `8`, `8`}, / cost of loading fp registers*
161	in SFmode, DFmode and XFmode /*
162	.fp_store: {`8`, `8`, `8`}, / cost of storing fp registers*
163	in SFmode, DFmode and XFmode /*
164	.mmx_move: `2`, / cost of moving MMX register /
165	.mmx_load: {`4`, `8`}, / cost of loading MMX registers*
166	in SImode and DImode /*
167	.mmx_store: {`4`, `8`}, / cost of storing MMX registers*
168	in SImode and DImode /*
169	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
170	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE registers*
171	in 32,64,128,256 and 512-bit /*
172	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE registers*
173	in 32,64,128,256 and 512-bit /*
174	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
175	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
176	.mask_load: {`2`, `4`, `2`}, / cost of loading mask register*
177	in QImode, HImode, SImode. /*
178	.mask_store: {`2`, `4`, `2`}, / cost if storing mask register*
179	in QImode, HImode, SImode. /*
180	.mask_move: `2`, / cost of moving mask register. /
181	/ End of register allocator costs. /
182	},
183
184	COSTS_N_INSNS (`1`), / cost of an add instruction /
185	COSTS_N_INSNS (`1`), / cost of a lea instruction /
186	COSTS_N_INSNS (`3`), / variable shift costs /
187	COSTS_N_INSNS (`2`), / constant shift costs /
188	.mult_init: {COSTS_N_INSNS (`6`), / cost of starting multiply for QI /
189	COSTS_N_INSNS (`6`), / HI /
190	COSTS_N_INSNS (`6`), / SI /
191	COSTS_N_INSNS (`6`), / DI /
192	COSTS_N_INSNS (`6`)}, / other /
193	COSTS_N_INSNS (`1`), / cost of multiply per each bit set /
194	.divide: {COSTS_N_INSNS (`23`), / cost of a divide/mod for QI /
195	COSTS_N_INSNS (`23`), / HI /
196	COSTS_N_INSNS (`23`), / SI /
197	COSTS_N_INSNS (`23`), / DI /
198	COSTS_N_INSNS (`23`)}, / other /
199	COSTS_N_INSNS (`3`), / cost of movsx /
200	COSTS_N_INSNS (`2`), / cost of movzx /
201	.large_insn: `15`, / "large" insn /
202	.move_ratio: `3`, / MOVE_RATIO /
203	.clear_ratio: `3`, / CLEAR_RATIO /
204	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
205	in QImode, HImode and SImode.
206	Relative to reg-reg move (2). /*
207	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
208	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE register*
209	in 32bit, 64bit, 128bit, 256bit and 512bit /*
210	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE register*
211	in 32bit, 64bit, 128bit, 256bit and 512bit /*
212	.sse_unaligned_load: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned loads. /
213	.sse_unaligned_store: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned stores. /
214	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
215	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
216	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
217	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
218	.l1_cache_size: `0`, / size of l1 cache /
219	.l2_cache_size: `0`, / size of l2 cache /
220	.prefetch_block: `0`, / size of prefetch block /
221	.simultaneous_prefetches: `0`, / number of parallel prefetches /
222	.branch_cost: `1`, / Branch cost /
223	COSTS_N_INSNS (`23`), / cost of FADD and FSUB insns. /
224	COSTS_N_INSNS (`27`), / cost of FMUL instruction. /
225	COSTS_N_INSNS (`88`), / cost of FDIV instruction. /
226	COSTS_N_INSNS (`22`), / cost of FABS instruction. /
227	COSTS_N_INSNS (`24`), / cost of FCHS instruction. /
228	COSTS_N_INSNS (`122`), / cost of FSQRT instruction. /
229
230	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
231	COSTS_N_INSNS (`23`), / cost of ADDSS/SD SUBSS/SD insns. /
232	COSTS_N_INSNS (`27`), / cost of MULSS instruction. /
233	COSTS_N_INSNS (`27`), / cost of MULSD instruction. /
234	COSTS_N_INSNS (`27`), / cost of FMA SS instruction. /
235	COSTS_N_INSNS (`27`), / cost of FMA SD instruction. /
236	COSTS_N_INSNS (`88`), / cost of DIVSS instruction. /
237	COSTS_N_INSNS (`88`), / cost of DIVSD instruction. /
238	COSTS_N_INSNS (`122`), / cost of SQRTSS instruction. /
239	COSTS_N_INSNS (`122`), / cost of SQRTSD instruction. /
240	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
241	.memcpy: i386_memcpy,
242	.memset: i386_memset,
243	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
244	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
245	.align_loop: "4", / Loop alignment. /
246	.align_jump: "4", / Jump alignment. /
247	NULL, / Label alignment. /
248	.align_func: "4", / Func alignment. /
249	.small_unroll_ninsns: `4`, / Small unroll limit. /
250	.small_unroll_factor: `2`, / Small unroll factor. /
251	};
252
253	static stringop_algs i486_memcpy[`2`] = {
254	{.unknown_size: rep_prefix_4_byte, .size: {{-`1`, rep_prefix_4_byte, false}}},
255	DUMMY_STRINGOP_ALGS};
256	static stringop_algs i486_memset[`2`] = {
257	{.unknown_size: rep_prefix_4_byte, .size: {{-`1`, rep_prefix_4_byte, false}}},
258	DUMMY_STRINGOP_ALGS};
259
260	static const
261	struct processor_costs i486_cost = { / 486 specific costs /
262	.hard_register: {
263	/ Start of register allocator costs. integer->integer move cost is 2. /
264	.movzbl_load: `4`, / cost for loading QImode using movzbl /
265	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
266	in QImode, HImode and SImode.
267	Relative to reg-reg move (2). /*
268	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
269	.fp_move: `2`, / cost of reg,reg fld/fst /
270	.fp_load: {`8`, `8`, `8`}, / cost of loading fp registers*
271	in SFmode, DFmode and XFmode /*
272	.fp_store: {`8`, `8`, `8`}, / cost of storing fp registers*
273	in SFmode, DFmode and XFmode /*
274	.mmx_move: `2`, / cost of moving MMX register /
275	.mmx_load: {`4`, `8`}, / cost of loading MMX registers*
276	in SImode and DImode /*
277	.mmx_store: {`4`, `8`}, / cost of storing MMX registers*
278	in SImode and DImode /*
279	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
280	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE registers*
281	in 32,64,128,256 and 512-bit /*
282	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE registers*
283	in 32,64,128,256 and 512-bit /*
284	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
285	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
286	.mask_load: {`2`, `4`, `2`}, / cost of loading mask register*
287	in QImode, HImode, SImode. /*
288	.mask_store: {`2`, `4`, `2`}, / cost if storing mask register*
289	in QImode, HImode, SImode. /*
290	.mask_move: `2`, / cost of moving mask register. /
291	/ End of register allocator costs. /
292	},
293
294	COSTS_N_INSNS (`1`), / cost of an add instruction /
295	COSTS_N_INSNS (`1`), / cost of a lea instruction /
296	COSTS_N_INSNS (`3`), / variable shift costs /
297	COSTS_N_INSNS (`2`), / constant shift costs /
298	.mult_init: {COSTS_N_INSNS (`12`), / cost of starting multiply for QI /
299	COSTS_N_INSNS (`12`), / HI /
300	COSTS_N_INSNS (`12`), / SI /
301	COSTS_N_INSNS (`12`), / DI /
302	COSTS_N_INSNS (`12`)}, / other /
303	.mult_bit: `1`, / cost of multiply per each bit set /
304	.divide: {COSTS_N_INSNS (`40`), / cost of a divide/mod for QI /
305	COSTS_N_INSNS (`40`), / HI /
306	COSTS_N_INSNS (`40`), / SI /
307	COSTS_N_INSNS (`40`), / DI /
308	COSTS_N_INSNS (`40`)}, / other /
309	COSTS_N_INSNS (`3`), / cost of movsx /
310	COSTS_N_INSNS (`2`), / cost of movzx /
311	.large_insn: `15`, / "large" insn /
312	.move_ratio: `3`, / MOVE_RATIO /
313	.clear_ratio: `3`, / CLEAR_RATIO /
314	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
315	in QImode, HImode and SImode.
316	Relative to reg-reg move (2). /*
317	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
318	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE register*
319	in 32bit, 64bit, 128bit, 256bit and 512bit /*
320	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE register*
321	in 32bit, 64bit, 128bit, 256bit and 512bit /*
322	.sse_unaligned_load: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned loads. /
323	.sse_unaligned_store: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned stores. /
324	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
325	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
326	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
327	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
328	.l1_cache_size: `4`, / size of l1 cache. 486 has 8kB cache*
329	shared for code and data, so 4kB is
330	not really precise. /*
331	.l2_cache_size: `4`, / size of l2 cache /
332	.prefetch_block: `0`, / size of prefetch block /
333	.simultaneous_prefetches: `0`, / number of parallel prefetches /
334	.branch_cost: `1`, / Branch cost /
335	COSTS_N_INSNS (`8`), / cost of FADD and FSUB insns. /
336	COSTS_N_INSNS (`16`), / cost of FMUL instruction. /
337	COSTS_N_INSNS (`73`), / cost of FDIV instruction. /
338	COSTS_N_INSNS (`3`), / cost of FABS instruction. /
339	COSTS_N_INSNS (`3`), / cost of FCHS instruction. /
340	COSTS_N_INSNS (`83`), / cost of FSQRT instruction. /
341
342	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
343	COSTS_N_INSNS (`8`), / cost of ADDSS/SD SUBSS/SD insns. /
344	COSTS_N_INSNS (`16`), / cost of MULSS instruction. /
345	COSTS_N_INSNS (`16`), / cost of MULSD instruction. /
346	COSTS_N_INSNS (`16`), / cost of FMA SS instruction. /
347	COSTS_N_INSNS (`16`), / cost of FMA SD instruction. /
348	COSTS_N_INSNS (`73`), / cost of DIVSS instruction. /
349	COSTS_N_INSNS (`74`), / cost of DIVSD instruction. /
350	COSTS_N_INSNS (`83`), / cost of SQRTSS instruction. /
351	COSTS_N_INSNS (`83`), / cost of SQRTSD instruction. /
352	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
353	.memcpy: i486_memcpy,
354	.memset: i486_memset,
355	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
356	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
357	.align_loop: "16", / Loop alignment. /
358	.align_jump: "16", / Jump alignment. /
359	.align_label: "0:0:8", / Label alignment. /
360	.align_func: "16", / Func alignment. /
361	.small_unroll_ninsns: `4`, / Small unroll limit. /
362	.small_unroll_factor: `2`, / Small unroll factor. /
363	};
364
365	static stringop_algs pentium_memcpy[`2`] = {
366	{.unknown_size: libcall, .size: {{`256`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
367	DUMMY_STRINGOP_ALGS};
368	static stringop_algs pentium_memset[`2`] = {
369	{.unknown_size: libcall, .size: {{-`1`, rep_prefix_4_byte, false}}},
370	DUMMY_STRINGOP_ALGS};
371
372	static const
373	struct processor_costs pentium_cost = {
374	.hard_register: {
375	/ Start of register allocator costs. integer->integer move cost is 2. /
376	.movzbl_load: `6`, / cost for loading QImode using movzbl /
377	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
378	in QImode, HImode and SImode.
379	Relative to reg-reg move (2). /*
380	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
381	.fp_move: `2`, / cost of reg,reg fld/fst /
382	.fp_load: {`2`, `2`, `6`}, / cost of loading fp registers*
383	in SFmode, DFmode and XFmode /*
384	.fp_store: {`4`, `4`, `6`}, / cost of storing fp registers*
385	in SFmode, DFmode and XFmode /*
386	.mmx_move: `8`, / cost of moving MMX register /
387	.mmx_load: {`8`, `8`}, / cost of loading MMX registers*
388	in SImode and DImode /*
389	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
390	in SImode and DImode /*
391	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
392	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE registers*
393	in 32,64,128,256 and 512-bit /*
394	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE registers*
395	in 32,64,128,256 and 512-bit /*
396	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
397	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
398	.mask_load: {`2`, `4`, `2`}, / cost of loading mask register*
399	in QImode, HImode, SImode. /*
400	.mask_store: {`2`, `4`, `2`}, / cost if storing mask register*
401	in QImode, HImode, SImode. /*
402	.mask_move: `2`, / cost of moving mask register. /
403	/ End of register allocator costs. /
404	},
405
406	COSTS_N_INSNS (`1`), / cost of an add instruction /
407	COSTS_N_INSNS (`1`), / cost of a lea instruction /
408	COSTS_N_INSNS (`4`), / variable shift costs /
409	COSTS_N_INSNS (`1`), / constant shift costs /
410	.mult_init: {COSTS_N_INSNS (`11`), / cost of starting multiply for QI /
411	COSTS_N_INSNS (`11`), / HI /
412	COSTS_N_INSNS (`11`), / SI /
413	COSTS_N_INSNS (`11`), / DI /
414	COSTS_N_INSNS (`11`)}, / other /
415	.mult_bit: `0`, / cost of multiply per each bit set /
416	.divide: {COSTS_N_INSNS (`25`), / cost of a divide/mod for QI /
417	COSTS_N_INSNS (`25`), / HI /
418	COSTS_N_INSNS (`25`), / SI /
419	COSTS_N_INSNS (`25`), / DI /
420	COSTS_N_INSNS (`25`)}, / other /
421	COSTS_N_INSNS (`3`), / cost of movsx /
422	COSTS_N_INSNS (`2`), / cost of movzx /
423	.large_insn: `8`, / "large" insn /
424	.move_ratio: `6`, / MOVE_RATIO /
425	.clear_ratio: `6`, / CLEAR_RATIO /
426	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
427	in QImode, HImode and SImode.
428	Relative to reg-reg move (2). /*
429	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
430	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE register*
431	in 32bit, 64bit, 128bit, 256bit and 512bit /*
432	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE register*
433	in 32bit, 64bit, 128bit, 256bit and 512bit /*
434	.sse_unaligned_load: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned loads. /
435	.sse_unaligned_store: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned stores. /
436	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
437	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
438	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
439	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
440	.l1_cache_size: `8`, / size of l1 cache. /
441	.l2_cache_size: `8`, / size of l2 cache /
442	.prefetch_block: `0`, / size of prefetch block /
443	.simultaneous_prefetches: `0`, / number of parallel prefetches /
444	.branch_cost: `2`, / Branch cost /
445	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
446	COSTS_N_INSNS (`3`), / cost of FMUL instruction. /
447	COSTS_N_INSNS (`39`), / cost of FDIV instruction. /
448	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
449	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
450	COSTS_N_INSNS (`70`), / cost of FSQRT instruction. /
451
452	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
453	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
454	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
455	COSTS_N_INSNS (`3`), / cost of MULSD instruction. /
456	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
457	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
458	COSTS_N_INSNS (`39`), / cost of DIVSS instruction. /
459	COSTS_N_INSNS (`39`), / cost of DIVSD instruction. /
460	COSTS_N_INSNS (`70`), / cost of SQRTSS instruction. /
461	COSTS_N_INSNS (`70`), / cost of SQRTSD instruction. /
462	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
463	.memcpy: pentium_memcpy,
464	.memset: pentium_memset,
465	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
466	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
467	.align_loop: "16:8:8", / Loop alignment. /
468	.align_jump: "16:8:8", / Jump alignment. /
469	.align_label: "0:0:8", / Label alignment. /
470	.align_func: "16", / Func alignment. /
471	.small_unroll_ninsns: `4`, / Small unroll limit. /
472	.small_unroll_factor: `2`, / Small unroll factor. /
473	};
474
475	static const
476	struct processor_costs lakemont_cost = {
477	.hard_register: {
478	/ Start of register allocator costs. integer->integer move cost is 2. /
479	.movzbl_load: `6`, / cost for loading QImode using movzbl /
480	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
481	in QImode, HImode and SImode.
482	Relative to reg-reg move (2). /*
483	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
484	.fp_move: `2`, / cost of reg,reg fld/fst /
485	.fp_load: {`2`, `2`, `6`}, / cost of loading fp registers*
486	in SFmode, DFmode and XFmode /*
487	.fp_store: {`4`, `4`, `6`}, / cost of storing fp registers*
488	in SFmode, DFmode and XFmode /*
489	.mmx_move: `8`, / cost of moving MMX register /
490	.mmx_load: {`8`, `8`}, / cost of loading MMX registers*
491	in SImode and DImode /*
492	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
493	in SImode and DImode /*
494	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
495	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE registers*
496	in 32,64,128,256 and 512-bit /*
497	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE registers*
498	in 32,64,128,256 and 512-bit /*
499	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
500	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
501	.mask_load: {`2`, `4`, `2`}, / cost of loading mask register*
502	in QImode, HImode, SImode. /*
503	.mask_store: {`2`, `4`, `2`}, / cost if storing mask register*
504	in QImode, HImode, SImode. /*
505	.mask_move: `2`, / cost of moving mask register. /
506	/ End of register allocator costs. /
507	},
508
509	COSTS_N_INSNS (`1`), / cost of an add instruction /
510	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
511	COSTS_N_INSNS (`1`), / variable shift costs /
512	COSTS_N_INSNS (`1`), / constant shift costs /
513	.mult_init: {COSTS_N_INSNS (`11`), / cost of starting multiply for QI /
514	COSTS_N_INSNS (`11`), / HI /
515	COSTS_N_INSNS (`11`), / SI /
516	COSTS_N_INSNS (`11`), / DI /
517	COSTS_N_INSNS (`11`)}, / other /
518	.mult_bit: `0`, / cost of multiply per each bit set /
519	.divide: {COSTS_N_INSNS (`25`), / cost of a divide/mod for QI /
520	COSTS_N_INSNS (`25`), / HI /
521	COSTS_N_INSNS (`25`), / SI /
522	COSTS_N_INSNS (`25`), / DI /
523	COSTS_N_INSNS (`25`)}, / other /
524	COSTS_N_INSNS (`3`), / cost of movsx /
525	COSTS_N_INSNS (`2`), / cost of movzx /
526	.large_insn: `8`, / "large" insn /
527	.move_ratio: `17`, / MOVE_RATIO /
528	.clear_ratio: `6`, / CLEAR_RATIO /
529	.int_load: {`2`, `4`, `2`}, / cost of loading integer registers*
530	in QImode, HImode and SImode.
531	Relative to reg-reg move (2). /*
532	.int_store: {`2`, `4`, `2`}, / cost of storing integer registers /
533	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE register*
534	in 32bit, 64bit, 128bit, 256bit and 512bit /*
535	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE register*
536	in 32bit, 64bit, 128bit, 256bit and 512bit /*
537	.sse_unaligned_load: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned loads. /
538	.sse_unaligned_store: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned stores. /
539	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
540	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
541	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
542	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
543	.l1_cache_size: `8`, / size of l1 cache. /
544	.l2_cache_size: `8`, / size of l2 cache /
545	.prefetch_block: `0`, / size of prefetch block /
546	.simultaneous_prefetches: `0`, / number of parallel prefetches /
547	.branch_cost: `2`, / Branch cost /
548	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
549	COSTS_N_INSNS (`3`), / cost of FMUL instruction. /
550	COSTS_N_INSNS (`39`), / cost of FDIV instruction. /
551	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
552	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
553	COSTS_N_INSNS (`70`), / cost of FSQRT instruction. /
554
555	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
556	COSTS_N_INSNS (`5`), / cost of ADDSS/SD SUBSS/SD insns. /
557	COSTS_N_INSNS (`5`), / cost of MULSS instruction. /
558	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
559	COSTS_N_INSNS (`10`), / cost of FMA SS instruction. /
560	COSTS_N_INSNS (`10`), / cost of FMA SD instruction. /
561	COSTS_N_INSNS (`31`), / cost of DIVSS instruction. /
562	COSTS_N_INSNS (`60`), / cost of DIVSD instruction. /
563	COSTS_N_INSNS (`31`), / cost of SQRTSS instruction. /
564	COSTS_N_INSNS (`63`), / cost of SQRTSD instruction. /
565	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
566	.memcpy: pentium_memcpy,
567	.memset: pentium_memset,
568	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
569	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
570	.align_loop: "16:8:8", / Loop alignment. /
571	.align_jump: "16:8:8", / Jump alignment. /
572	.align_label: "0:0:8", / Label alignment. /
573	.align_func: "16", / Func alignment. /
574	.small_unroll_ninsns: `4`, / Small unroll limit. /
575	.small_unroll_factor: `2`, / Small unroll factor. /
576	};
577
578	/ PentiumPro has optimized rep instructions for blocks aligned by 8 bytes*
579	(we ensure the alignment). For small blocks inline loop is still a
580	noticeable win, for bigger blocks either rep movsl or rep movsb is
581	way to go. Rep movsb has apparently more expensive startup time in CPU,
582	but after 4K the difference is down in the noise. /*
583	static stringop_algs pentiumpro_memcpy[`2`] = {
584	{.unknown_size: rep_prefix_4_byte, .size: {{`128`, loop, false}, {`1024`, unrolled_loop, false},
585	{`8192`, rep_prefix_4_byte, false},
586	{-`1`, rep_prefix_1_byte, false}}},
587	DUMMY_STRINGOP_ALGS};
588	static stringop_algs pentiumpro_memset[`2`] = {
589	{.unknown_size: rep_prefix_4_byte, .size: {{`1024`, unrolled_loop, false},
590	{`8192`, rep_prefix_4_byte, false},
591	{-`1`, libcall, false}}},
592	DUMMY_STRINGOP_ALGS};
593	static const
594	struct processor_costs pentiumpro_cost = {
595	.hard_register: {
596	/ Start of register allocator costs. integer->integer move cost is 2. /
597	.movzbl_load: `2`, / cost for loading QImode using movzbl /
598	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
599	in QImode, HImode and SImode.
600	Relative to reg-reg move (2). /*
601	.int_store: {`2`, `2`, `2`}, / cost of storing integer registers /
602	.fp_move: `2`, / cost of reg,reg fld/fst /
603	.fp_load: {`2`, `2`, `6`}, / cost of loading fp registers*
604	in SFmode, DFmode and XFmode /*
605	.fp_store: {`4`, `4`, `6`}, / cost of storing fp registers*
606	in SFmode, DFmode and XFmode /*
607	.mmx_move: `2`, / cost of moving MMX register /
608	.mmx_load: {`2`, `2`}, / cost of loading MMX registers*
609	in SImode and DImode /*
610	.mmx_store: {`2`, `2`}, / cost of storing MMX registers*
611	in SImode and DImode /*
612	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
613	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE registers*
614	in 32,64,128,256 and 512-bit /*
615	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE registers*
616	in 32,64,128,256 and 512-bit /*
617	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
618	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
619	.mask_load: {`4`, `4`, `4`}, / cost of loading mask register*
620	in QImode, HImode, SImode. /*
621	.mask_store: {`2`, `2`, `2`}, / cost if storing mask register*
622	in QImode, HImode, SImode. /*
623	.mask_move: `2`, / cost of moving mask register. /
624	/ End of register allocator costs. /
625	},
626
627	COSTS_N_INSNS (`1`), / cost of an add instruction /
628	COSTS_N_INSNS (`1`), / cost of a lea instruction /
629	COSTS_N_INSNS (`1`), / variable shift costs /
630	COSTS_N_INSNS (`1`), / constant shift costs /
631	.mult_init: {COSTS_N_INSNS (`4`), / cost of starting multiply for QI /
632	COSTS_N_INSNS (`4`), / HI /
633	COSTS_N_INSNS (`4`), / SI /
634	COSTS_N_INSNS (`4`), / DI /
635	COSTS_N_INSNS (`4`)}, / other /
636	.mult_bit: `0`, / cost of multiply per each bit set /
637	.divide: {COSTS_N_INSNS (`17`), / cost of a divide/mod for QI /
638	COSTS_N_INSNS (`17`), / HI /
639	COSTS_N_INSNS (`17`), / SI /
640	COSTS_N_INSNS (`17`), / DI /
641	COSTS_N_INSNS (`17`)}, / other /
642	COSTS_N_INSNS (`1`), / cost of movsx /
643	COSTS_N_INSNS (`1`), / cost of movzx /
644	.large_insn: `8`, / "large" insn /
645	.move_ratio: `6`, / MOVE_RATIO /
646	.clear_ratio: `6`, / CLEAR_RATIO /
647	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
648	in QImode, HImode and SImode.
649	Relative to reg-reg move (2). /*
650	.int_store: {`2`, `2`, `2`}, / cost of storing integer registers /
651	.sse_load: {`4`, `8`, `16`, `32`, `64`}, / cost of loading SSE register*
652	in 32bit, 64bit, 128bit, 256bit and 512bit /*
653	.sse_store: {`4`, `8`, `16`, `32`, `64`}, / cost of storing SSE register*
654	in 32bit, 64bit, 128bit, 256bit and 512bit /*
655	.sse_unaligned_load: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned loads. /
656	.sse_unaligned_store: {`4`, `8`, `16`, `32`, `64`}, / cost of unaligned stores. /
657	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
658	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
659	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
660	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
661	.l1_cache_size: `8`, / size of l1 cache. /
662	.l2_cache_size: `256`, / size of l2 cache /
663	.prefetch_block: `32`, / size of prefetch block /
664	.simultaneous_prefetches: `6`, / number of parallel prefetches /
665	.branch_cost: `2`, / Branch cost /
666	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
667	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
668	COSTS_N_INSNS (`56`), / cost of FDIV instruction. /
669	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
670	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
671	COSTS_N_INSNS (`56`), / cost of FSQRT instruction. /
672
673	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
674	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
675	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
676	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
677	COSTS_N_INSNS (`7`), / cost of FMA SS instruction. /
678	COSTS_N_INSNS (`7`), / cost of FMA SD instruction. /
679	COSTS_N_INSNS (`18`), / cost of DIVSS instruction. /
680	COSTS_N_INSNS (`18`), / cost of DIVSD instruction. /
681	COSTS_N_INSNS (`31`), / cost of SQRTSS instruction. /
682	COSTS_N_INSNS (`31`), / cost of SQRTSD instruction. /
683	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
684	.memcpy: pentiumpro_memcpy,
685	.memset: pentiumpro_memset,
686	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
687	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
688	.align_loop: "16", / Loop alignment. /
689	.align_jump: "16:11:8", / Jump alignment. /
690	.align_label: "0:0:8", / Label alignment. /
691	.align_func: "16", / Func alignment. /
692	.small_unroll_ninsns: `4`, / Small unroll limit. /
693	.small_unroll_factor: `2`, / Small unroll factor. /
694	};
695
696	static stringop_algs geode_memcpy[`2`] = {
697	{.unknown_size: libcall, .size: {{`256`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
698	DUMMY_STRINGOP_ALGS};
699	static stringop_algs geode_memset[`2`] = {
700	{.unknown_size: libcall, .size: {{`256`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
701	DUMMY_STRINGOP_ALGS};
702	static const
703	struct processor_costs geode_cost = {
704	.hard_register: {
705	/ Start of register allocator costs. integer->integer move cost is 2. /
706	.movzbl_load: `2`, / cost for loading QImode using movzbl /
707	.int_load: {`2`, `2`, `2`}, / cost of loading integer registers*
708	in QImode, HImode and SImode.
709	Relative to reg-reg move (2). /*
710	.int_store: {`2`, `2`, `2`}, / cost of storing integer registers /
711	.fp_move: `2`, / cost of reg,reg fld/fst /
712	.fp_load: {`2`, `2`, `2`}, / cost of loading fp registers*
713	in SFmode, DFmode and XFmode /*
714	.fp_store: {`4`, `6`, `6`}, / cost of storing fp registers*
715	in SFmode, DFmode and XFmode /*
716	.mmx_move: `2`, / cost of moving MMX register /
717	.mmx_load: {`2`, `2`}, / cost of loading MMX registers*
718	in SImode and DImode /*
719	.mmx_store: {`2`, `2`}, / cost of storing MMX registers*
720	in SImode and DImode /*
721	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
722	.sse_load: {`2`, `2`, `8`, `16`, `32`}, / cost of loading SSE registers*
723	in 32,64,128,256 and 512-bit /*
724	.sse_store: {`2`, `2`, `8`, `16`, `32`}, / cost of storing SSE registers*
725	in 32,64,128,256 and 512-bit /*
726	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
727	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
728	.mask_load: {`2`, `2`, `2`}, / cost of loading mask register*
729	in QImode, HImode, SImode. /*
730	.mask_store: {`2`, `2`, `2`}, / cost if storing mask register*
731	in QImode, HImode, SImode. /*
732	.mask_move: `2`, / cost of moving mask register. /
733	/ End of register allocator costs. /
734	},
735
736	COSTS_N_INSNS (`1`), / cost of an add instruction /
737	COSTS_N_INSNS (`1`), / cost of a lea instruction /
738	COSTS_N_INSNS (`2`), / variable shift costs /
739	COSTS_N_INSNS (`1`), / constant shift costs /
740	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
741	COSTS_N_INSNS (`4`), / HI /
742	COSTS_N_INSNS (`7`), / SI /
743	COSTS_N_INSNS (`7`), / DI /
744	COSTS_N_INSNS (`7`)}, / other /
745	.mult_bit: `0`, / cost of multiply per each bit set /
746	.divide: {COSTS_N_INSNS (`15`), / cost of a divide/mod for QI /
747	COSTS_N_INSNS (`23`), / HI /
748	COSTS_N_INSNS (`39`), / SI /
749	COSTS_N_INSNS (`39`), / DI /
750	COSTS_N_INSNS (`39`)}, / other /
751	COSTS_N_INSNS (`1`), / cost of movsx /
752	COSTS_N_INSNS (`1`), / cost of movzx /
753	.large_insn: `8`, / "large" insn /
754	.move_ratio: `4`, / MOVE_RATIO /
755	.clear_ratio: `4`, / CLEAR_RATIO /
756	.int_load: {`2`, `2`, `2`}, / cost of loading integer registers*
757	in QImode, HImode and SImode.
758	Relative to reg-reg move (2). /*
759	.int_store: {`2`, `2`, `2`}, / cost of storing integer registers /
760	.sse_load: {`2`, `2`, `8`, `16`, `32`}, / cost of loading SSE register*
761	in 32bit, 64bit, 128bit, 256bit and 512bit /*
762	.sse_store: {`2`, `2`, `8`, `16`, `32`}, / cost of storing SSE register*
763	in 32bit, 64bit, 128bit, 256bit and 512bit /*
764	.sse_unaligned_load: {`2`, `2`, `8`, `16`, `32`}, / cost of unaligned loads. /
765	.sse_unaligned_store: {`2`, `2`, `8`, `16`, `32`}, / cost of unaligned stores. /
766	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
767	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
768	.gather_static: `2`, .gather_per_elt: `2`, / Gather load static, per_elt. /
769	.scatter_static: `2`, .scatter_per_elt: `2`, / Gather store static, per_elt. /
770	.l1_cache_size: `64`, / size of l1 cache. /
771	.l2_cache_size: `128`, / size of l2 cache. /
772	.prefetch_block: `32`, / size of prefetch block /
773	.simultaneous_prefetches: `1`, / number of parallel prefetches /
774	.branch_cost: `1`, / Branch cost /
775	COSTS_N_INSNS (`6`), / cost of FADD and FSUB insns. /
776	COSTS_N_INSNS (`11`), / cost of FMUL instruction. /
777	COSTS_N_INSNS (`47`), / cost of FDIV instruction. /
778	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
779	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
780	COSTS_N_INSNS (`54`), / cost of FSQRT instruction. /
781
782	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
783	COSTS_N_INSNS (`6`), / cost of ADDSS/SD SUBSS/SD insns. /
784	COSTS_N_INSNS (`11`), / cost of MULSS instruction. /
785	COSTS_N_INSNS (`11`), / cost of MULSD instruction. /
786	COSTS_N_INSNS (`17`), / cost of FMA SS instruction. /
787	COSTS_N_INSNS (`17`), / cost of FMA SD instruction. /
788	COSTS_N_INSNS (`47`), / cost of DIVSS instruction. /
789	COSTS_N_INSNS (`47`), / cost of DIVSD instruction. /
790	COSTS_N_INSNS (`54`), / cost of SQRTSS instruction. /
791	COSTS_N_INSNS (`54`), / cost of SQRTSD instruction. /
792	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
793	.memcpy: geode_memcpy,
794	.memset: geode_memset,
795	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
796	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
797	NULL, / Loop alignment. /
798	NULL, / Jump alignment. /
799	NULL, / Label alignment. /
800	NULL, / Func alignment. /
801	.small_unroll_ninsns: `4`, / Small unroll limit. /
802	.small_unroll_factor: `2`, / Small unroll factor. /
803	};
804
805	static stringop_algs k6_memcpy[`2`] = {
806	{.unknown_size: libcall, .size: {{`256`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
807	DUMMY_STRINGOP_ALGS};
808	static stringop_algs k6_memset[`2`] = {
809	{.unknown_size: libcall, .size: {{`256`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
810	DUMMY_STRINGOP_ALGS};
811	static const
812	struct processor_costs k6_cost = {
813	.hard_register: {
814	/ Start of register allocator costs. integer->integer move cost is 2. /
815	.movzbl_load: `3`, / cost for loading QImode using movzbl /
816	.int_load: {`4`, `5`, `4`}, / cost of loading integer registers*
817	in QImode, HImode and SImode.
818	Relative to reg-reg move (2). /*
819	.int_store: {`2`, `3`, `2`}, / cost of storing integer registers /
820	.fp_move: `4`, / cost of reg,reg fld/fst /
821	.fp_load: {`6`, `6`, `6`}, / cost of loading fp registers*
822	in SFmode, DFmode and XFmode /*
823	.fp_store: {`4`, `4`, `4`}, / cost of storing fp registers*
824	in SFmode, DFmode and XFmode /*
825	.mmx_move: `2`, / cost of moving MMX register /
826	.mmx_load: {`2`, `2`}, / cost of loading MMX registers*
827	in SImode and DImode /*
828	.mmx_store: {`2`, `2`}, / cost of storing MMX registers*
829	in SImode and DImode /*
830	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
831	.sse_load: {`2`, `2`, `8`, `16`, `32`}, / cost of loading SSE registers*
832	in 32,64,128,256 and 512-bit /*
833	.sse_store: {`2`, `2`, `8`, `16`, `32`}, / cost of storing SSE registers*
834	in 32,64,128,256 and 512-bit /*
835	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
836	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
837	.mask_load: {`4`, `5`, `4`}, / cost of loading mask register*
838	in QImode, HImode, SImode. /*
839	.mask_store: {`2`, `3`, `2`}, / cost if storing mask register*
840	in QImode, HImode, SImode. /*
841	.mask_move: `2`, / cost of moving mask register. /
842	/ End of register allocator costs. /
843	},
844
845	COSTS_N_INSNS (`1`), / cost of an add instruction /
846	COSTS_N_INSNS (`2`), / cost of a lea instruction /
847	COSTS_N_INSNS (`1`), / variable shift costs /
848	COSTS_N_INSNS (`1`), / constant shift costs /
849	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
850	COSTS_N_INSNS (`3`), / HI /
851	COSTS_N_INSNS (`3`), / SI /
852	COSTS_N_INSNS (`3`), / DI /
853	COSTS_N_INSNS (`3`)}, / other /
854	.mult_bit: `0`, / cost of multiply per each bit set /
855	.divide: {COSTS_N_INSNS (`18`), / cost of a divide/mod for QI /
856	COSTS_N_INSNS (`18`), / HI /
857	COSTS_N_INSNS (`18`), / SI /
858	COSTS_N_INSNS (`18`), / DI /
859	COSTS_N_INSNS (`18`)}, / other /
860	COSTS_N_INSNS (`2`), / cost of movsx /
861	COSTS_N_INSNS (`2`), / cost of movzx /
862	.large_insn: `8`, / "large" insn /
863	.move_ratio: `4`, / MOVE_RATIO /
864	.clear_ratio: `4`, / CLEAR_RATIO /
865	.int_load: {`4`, `5`, `4`}, / cost of loading integer registers*
866	in QImode, HImode and SImode.
867	Relative to reg-reg move (2). /*
868	.int_store: {`2`, `3`, `2`}, / cost of storing integer registers /
869	.sse_load: {`2`, `2`, `8`, `16`, `32`}, / cost of loading SSE register*
870	in 32bit, 64bit, 128bit, 256bit and 512bit /*
871	.sse_store: {`2`, `2`, `8`, `16`, `32`}, / cost of storing SSE register*
872	in 32bit, 64bit, 128bit, 256bit and 512bit /*
873	.sse_unaligned_load: {`2`, `2`, `8`, `16`, `32`}, / cost of unaligned loads. /
874	.sse_unaligned_store: {`2`, `2`, `8`, `16`, `32`}, / cost of unaligned stores. /
875	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
876	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
877	.gather_static: `2`, .gather_per_elt: `2`, / Gather load static, per_elt. /
878	.scatter_static: `2`, .scatter_per_elt: `2`, / Gather store static, per_elt. /
879	.l1_cache_size: `32`, / size of l1 cache. /
880	.l2_cache_size: `32`, / size of l2 cache. Some models*
881	have integrated l2 cache, but
882	optimizing for k6 is not important
883	enough to worry about that. /*
884	.prefetch_block: `32`, / size of prefetch block /
885	.simultaneous_prefetches: `1`, / number of parallel prefetches /
886	.branch_cost: `1`, / Branch cost /
887	COSTS_N_INSNS (`2`), / cost of FADD and FSUB insns. /
888	COSTS_N_INSNS (`2`), / cost of FMUL instruction. /
889	COSTS_N_INSNS (`56`), / cost of FDIV instruction. /
890	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
891	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
892	COSTS_N_INSNS (`56`), / cost of FSQRT instruction. /
893
894	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
895	COSTS_N_INSNS (`2`), / cost of ADDSS/SD SUBSS/SD insns. /
896	COSTS_N_INSNS (`2`), / cost of MULSS instruction. /
897	COSTS_N_INSNS (`2`), / cost of MULSD instruction. /
898	COSTS_N_INSNS (`4`), / cost of FMA SS instruction. /
899	COSTS_N_INSNS (`4`), / cost of FMA SD instruction. /
900	COSTS_N_INSNS (`56`), / cost of DIVSS instruction. /
901	COSTS_N_INSNS (`56`), / cost of DIVSD instruction. /
902	COSTS_N_INSNS (`56`), / cost of SQRTSS instruction. /
903	COSTS_N_INSNS (`56`), / cost of SQRTSD instruction. /
904	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
905	.memcpy: k6_memcpy,
906	.memset: k6_memset,
907	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
908	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
909	.align_loop: "32:8:8", / Loop alignment. /
910	.align_jump: "32:8:8", / Jump alignment. /
911	.align_label: "0:0:8", / Label alignment. /
912	.align_func: "32", / Func alignment. /
913	.small_unroll_ninsns: `4`, / Small unroll limit. /
914	.small_unroll_factor: `2`, / Small unroll factor. /
915	};
916
917	/ For some reason, Athlon deals better with REP prefix (relative to loops)*
918	compared to K8. Alignment becomes important after 8 bytes for memcpy and
919	128 bytes for memset. /*
920	static stringop_algs athlon_memcpy[`2`] = {
921	{.unknown_size: libcall, .size: {{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
922	DUMMY_STRINGOP_ALGS};
923	static stringop_algs athlon_memset[`2`] = {
924	{.unknown_size: libcall, .size: {{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
925	DUMMY_STRINGOP_ALGS};
926	static const
927	struct processor_costs athlon_cost = {
928	.hard_register: {
929	/ Start of register allocator costs. integer->integer move cost is 2. /
930	.movzbl_load: `4`, / cost for loading QImode using movzbl /
931	.int_load: {`3`, `4`, `3`}, / cost of loading integer registers*
932	in QImode, HImode and SImode.
933	Relative to reg-reg move (2). /*
934	.int_store: {`3`, `4`, `3`}, / cost of storing integer registers /
935	.fp_move: `4`, / cost of reg,reg fld/fst /
936	.fp_load: {`4`, `4`, `12`}, / cost of loading fp registers*
937	in SFmode, DFmode and XFmode /*
938	.fp_store: {`6`, `6`, `8`}, / cost of storing fp registers*
939	in SFmode, DFmode and XFmode /*
940	.mmx_move: `2`, / cost of moving MMX register /
941	.mmx_load: {`4`, `4`}, / cost of loading MMX registers*
942	in SImode and DImode /*
943	.mmx_store: {`4`, `4`}, / cost of storing MMX registers*
944	in SImode and DImode /*
945	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
946	.sse_load: {`4`, `4`, `12`, `12`, `24`}, / cost of loading SSE registers*
947	in 32,64,128,256 and 512-bit /*
948	.sse_store: {`4`, `4`, `10`, `10`, `20`}, / cost of storing SSE registers*
949	in 32,64,128,256 and 512-bit /*
950	.sse_to_integer: `5`, .integer_to_sse: `5`, / SSE->integer and integer->SSE moves /
951	.mask_to_integer: `5`, .integer_to_mask: `5`, / mask->integer and integer->mask moves /
952	.mask_load: {`3`, `4`, `3`}, / cost of loading mask register*
953	in QImode, HImode, SImode. /*
954	.mask_store: {`3`, `4`, `3`}, / cost if storing mask register*
955	in QImode, HImode, SImode. /*
956	.mask_move: `2`, / cost of moving mask register. /
957	/ End of register allocator costs. /
958	},
959
960	COSTS_N_INSNS (`1`), / cost of an add instruction /
961	COSTS_N_INSNS (`2`), / cost of a lea instruction /
962	COSTS_N_INSNS (`1`), / variable shift costs /
963	COSTS_N_INSNS (`1`), / constant shift costs /
964	.mult_init: {COSTS_N_INSNS (`5`), / cost of starting multiply for QI /
965	COSTS_N_INSNS (`5`), / HI /
966	COSTS_N_INSNS (`5`), / SI /
967	COSTS_N_INSNS (`5`), / DI /
968	COSTS_N_INSNS (`5`)}, / other /
969	.mult_bit: `0`, / cost of multiply per each bit set /
970	.divide: {COSTS_N_INSNS (`18`), / cost of a divide/mod for QI /
971	COSTS_N_INSNS (`26`), / HI /
972	COSTS_N_INSNS (`42`), / SI /
973	COSTS_N_INSNS (`74`), / DI /
974	COSTS_N_INSNS (`74`)}, / other /
975	COSTS_N_INSNS (`1`), / cost of movsx /
976	COSTS_N_INSNS (`1`), / cost of movzx /
977	.large_insn: `8`, / "large" insn /
978	.move_ratio: `9`, / MOVE_RATIO /
979	.clear_ratio: `6`, / CLEAR_RATIO /
980	.int_load: {`3`, `4`, `3`}, / cost of loading integer registers*
981	in QImode, HImode and SImode.
982	Relative to reg-reg move (2). /*
983	.int_store: {`3`, `4`, `3`}, / cost of storing integer registers /
984	.sse_load: {`4`, `4`, `12`, `12`, `24`}, / cost of loading SSE register*
985	in 32bit, 64bit, 128bit, 256bit and 512bit /*
986	.sse_store: {`4`, `4`, `10`, `10`, `20`}, / cost of storing SSE register*
987	in 32bit, 64bit, 128bit, 256bit and 512bit /*
988	.sse_unaligned_load: {`4`, `4`, `12`, `12`, `24`}, / cost of unaligned loads. /
989	.sse_unaligned_store: {`4`, `4`, `10`, `10`, `20`}, / cost of unaligned stores. /
990	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
991	.sse_to_integer: `5`, / cost of moving SSE register to integer. /
992	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
993	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
994	.l1_cache_size: `64`, / size of l1 cache. /
995	.l2_cache_size: `256`, / size of l2 cache. /
996	.prefetch_block: `64`, / size of prefetch block /
997	.simultaneous_prefetches: `6`, / number of parallel prefetches /
998	.branch_cost: `5`, / Branch cost /
999	COSTS_N_INSNS (`4`), / cost of FADD and FSUB insns. /
1000	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
1001	COSTS_N_INSNS (`24`), / cost of FDIV instruction. /
1002	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
1003	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
1004	COSTS_N_INSNS (`35`), / cost of FSQRT instruction. /
1005
1006	COSTS_N_INSNS (`2`), / cost of cheap SSE instruction. /
1007	COSTS_N_INSNS (`4`), / cost of ADDSS/SD SUBSS/SD insns. /
1008	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
1009	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
1010	COSTS_N_INSNS (`8`), / cost of FMA SS instruction. /
1011	COSTS_N_INSNS (`8`), / cost of FMA SD instruction. /
1012	/ 11-16 /
1013	COSTS_N_INSNS (`16`), / cost of DIVSS instruction. /
1014	COSTS_N_INSNS (`24`), / cost of DIVSD instruction. /
1015	COSTS_N_INSNS (`19`), / cost of SQRTSS instruction. /
1016	COSTS_N_INSNS (`19`), / cost of SQRTSD instruction. /
1017	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
1018	.memcpy: athlon_memcpy,
1019	.memset: athlon_memset,
1020	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
1021	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
1022	.align_loop: "16:8:8", / Loop alignment. /
1023	.align_jump: "16:8:8", / Jump alignment. /
1024	.align_label: "0:0:8", / Label alignment. /
1025	.align_func: "16", / Func alignment. /
1026	.small_unroll_ninsns: `4`, / Small unroll limit. /
1027	.small_unroll_factor: `2`, / Small unroll factor. /
1028	};
1029
1030	/ K8 has optimized REP instruction for medium sized blocks, but for very*
1031	small blocks it is better to use loop. For large blocks, libcall can
1032	do nontemporary accesses and beat inline considerably. /*
1033	static stringop_algs k8_memcpy[`2`] = {
1034	{.unknown_size: libcall, .size: {{`6`, loop, false}, {`14`, unrolled_loop, false},
1035	{-`1`, rep_prefix_4_byte, false}}},
1036	{.unknown_size: libcall, .size: {{`16`, loop, false}, {`8192`, rep_prefix_8_byte, false},
1037	{-`1`, libcall, false}}}};
1038	static stringop_algs k8_memset[`2`] = {
1039	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`24`, unrolled_loop, false},
1040	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
1041	{.unknown_size: libcall, .size: {{`48`, unrolled_loop, false},
1042	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
1043	static const
1044	struct processor_costs k8_cost = {
1045	.hard_register: {
1046	/ Start of register allocator costs. integer->integer move cost is 2. /
1047	.movzbl_load: `4`, / cost for loading QImode using movzbl /
1048	.int_load: {`3`, `4`, `3`}, / cost of loading integer registers*
1049	in QImode, HImode and SImode.
1050	Relative to reg-reg move (2). /*
1051	.int_store: {`3`, `4`, `3`}, / cost of storing integer registers /
1052	.fp_move: `4`, / cost of reg,reg fld/fst /
1053	.fp_load: {`4`, `4`, `12`}, / cost of loading fp registers*
1054	in SFmode, DFmode and XFmode /*
1055	.fp_store: {`6`, `6`, `8`}, / cost of storing fp registers*
1056	in SFmode, DFmode and XFmode /*
1057	.mmx_move: `2`, / cost of moving MMX register /
1058	.mmx_load: {`3`, `3`}, / cost of loading MMX registers*
1059	in SImode and DImode /*
1060	.mmx_store: {`4`, `4`}, / cost of storing MMX registers*
1061	in SImode and DImode /*
1062	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
1063	.sse_load: {`4`, `3`, `12`, `12`, `24`}, / cost of loading SSE registers*
1064	in 32,64,128,256 and 512-bit /*
1065	.sse_store: {`4`, `4`, `10`, `10`, `20`}, / cost of storing SSE registers*
1066	in 32,64,128,256 and 512-bit /*
1067	.sse_to_integer: `5`, .integer_to_sse: `5`, / SSE->integer and integer->SSE moves /
1068	.mask_to_integer: `5`, .integer_to_mask: `5`, / mask->integer and integer->mask moves /
1069	.mask_load: {`3`, `4`, `3`}, / cost of loading mask register*
1070	in QImode, HImode, SImode. /*
1071	.mask_store: {`3`, `4`, `3`}, / cost if storing mask register*
1072	in QImode, HImode, SImode. /*
1073	.mask_move: `2`, / cost of moving mask register. /
1074	/ End of register allocator costs. /
1075	},
1076
1077	COSTS_N_INSNS (`1`), / cost of an add instruction /
1078	COSTS_N_INSNS (`2`), / cost of a lea instruction /
1079	COSTS_N_INSNS (`1`), / variable shift costs /
1080	COSTS_N_INSNS (`1`), / constant shift costs /
1081	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
1082	COSTS_N_INSNS (`4`), / HI /
1083	COSTS_N_INSNS (`3`), / SI /
1084	COSTS_N_INSNS (`4`), / DI /
1085	COSTS_N_INSNS (`5`)}, / other /
1086	.mult_bit: `0`, / cost of multiply per each bit set /
1087	.divide: {COSTS_N_INSNS (`18`), / cost of a divide/mod for QI /
1088	COSTS_N_INSNS (`26`), / HI /
1089	COSTS_N_INSNS (`42`), / SI /
1090	COSTS_N_INSNS (`74`), / DI /
1091	COSTS_N_INSNS (`74`)}, / other /
1092	COSTS_N_INSNS (`1`), / cost of movsx /
1093	COSTS_N_INSNS (`1`), / cost of movzx /
1094	.large_insn: `8`, / "large" insn /
1095	.move_ratio: `9`, / MOVE_RATIO /
1096	.clear_ratio: `6`, / CLEAR_RATIO /
1097	.int_load: {`3`, `4`, `3`}, / cost of loading integer registers*
1098	in QImode, HImode and SImode.
1099	Relative to reg-reg move (2). /*
1100	.int_store: {`3`, `4`, `3`}, / cost of storing integer registers /
1101	.sse_load: {`4`, `3`, `12`, `12`, `24`}, / cost of loading SSE register*
1102	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1103	.sse_store: {`4`, `4`, `10`, `10`, `20`}, / cost of storing SSE register*
1104	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1105	.sse_unaligned_load: {`4`, `3`, `12`, `12`, `24`}, / cost of unaligned loads. /
1106	.sse_unaligned_store: {`4`, `4`, `10`, `10`, `20`}, / cost of unaligned stores. /
1107	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
1108	.sse_to_integer: `5`, / cost of moving SSE register to integer. /
1109	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
1110	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
1111	.l1_cache_size: `64`, / size of l1 cache. /
1112	.l2_cache_size: `512`, / size of l2 cache. /
1113	.prefetch_block: `64`, / size of prefetch block /
1114	/ New AMD processors never drop prefetches; if they cannot be performed*
1115	immediately, they are queued. We set number of simultaneous prefetches
1116	to a large constant to reflect this (it probably is not a good idea not
1117	to limit number of prefetches at all, as their execution also takes some
1118	time). /*
1119	.simultaneous_prefetches: `100`, / number of parallel prefetches /
1120	.branch_cost: `3`, / Branch cost /
1121	COSTS_N_INSNS (`4`), / cost of FADD and FSUB insns. /
1122	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
1123	COSTS_N_INSNS (`19`), / cost of FDIV instruction. /
1124	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
1125	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
1126	COSTS_N_INSNS (`35`), / cost of FSQRT instruction. /
1127
1128	COSTS_N_INSNS (`2`), / cost of cheap SSE instruction. /
1129	COSTS_N_INSNS (`4`), / cost of ADDSS/SD SUBSS/SD insns. /
1130	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
1131	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
1132	COSTS_N_INSNS (`8`), / cost of FMA SS instruction. /
1133	COSTS_N_INSNS (`8`), / cost of FMA SD instruction. /
1134	/ 11-16 /
1135	COSTS_N_INSNS (`16`), / cost of DIVSS instruction. /
1136	COSTS_N_INSNS (`20`), / cost of DIVSD instruction. /
1137	COSTS_N_INSNS (`19`), / cost of SQRTSS instruction. /
1138	COSTS_N_INSNS (`27`), / cost of SQRTSD instruction. /
1139	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
1140	.memcpy: k8_memcpy,
1141	.memset: k8_memset,
1142	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
1143	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
1144	.align_loop: "16:8:8", / Loop alignment. /
1145	.align_jump: "16:8:8", / Jump alignment. /
1146	.align_label: "0:0:8", / Label alignment. /
1147	.align_func: "16", / Func alignment. /
1148	.small_unroll_ninsns: `4`, / Small unroll limit. /
1149	.small_unroll_factor: `2`, / Small unroll factor. /
1150	};
1151
1152	/ AMDFAM10 has optimized REP instruction for medium sized blocks, but for*
1153	very small blocks it is better to use loop. For large blocks, libcall can
1154	do nontemporary accesses and beat inline considerably. /*
1155	static stringop_algs amdfam10_memcpy[`2`] = {
1156	{.unknown_size: libcall, .size: {{`6`, loop, false}, {`14`, unrolled_loop, false},
1157	{-`1`, rep_prefix_4_byte, false}}},
1158	{.unknown_size: libcall, .size: {{`16`, loop, false}, {`8192`, rep_prefix_8_byte, false},
1159	{-`1`, libcall, false}}}};
1160	static stringop_algs amdfam10_memset[`2`] = {
1161	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`24`, unrolled_loop, false},
1162	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
1163	{.unknown_size: libcall, .size: {{`48`, unrolled_loop, false}, {`8192`, rep_prefix_8_byte, false},
1164	{-`1`, libcall, false}}}};
1165	struct processor_costs amdfam10_cost = {
1166	.hard_register: {
1167	/ Start of register allocator costs. integer->integer move cost is 2. /
1168	.movzbl_load: `4`, / cost for loading QImode using movzbl /
1169	.int_load: {`3`, `4`, `3`}, / cost of loading integer registers*
1170	in QImode, HImode and SImode.
1171	Relative to reg-reg move (2). /*
1172	.int_store: {`3`, `4`, `3`}, / cost of storing integer registers /
1173	.fp_move: `4`, / cost of reg,reg fld/fst /
1174	.fp_load: {`4`, `4`, `12`}, / cost of loading fp registers*
1175	in SFmode, DFmode and XFmode /*
1176	.fp_store: {`6`, `6`, `8`}, / cost of storing fp registers*
1177	in SFmode, DFmode and XFmode /*
1178	.mmx_move: `2`, / cost of moving MMX register /
1179	.mmx_load: {`3`, `3`}, / cost of loading MMX registers*
1180	in SImode and DImode /*
1181	.mmx_store: {`4`, `4`}, / cost of storing MMX registers*
1182	in SImode and DImode /*
1183	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
1184	.sse_load: {`4`, `4`, `3`, `6`, `12`}, / cost of loading SSE registers*
1185	in 32,64,128,256 and 512-bit /*
1186	.sse_store: {`4`, `4`, `5`, `10`, `20`}, / cost of storing SSE registers*
1187	in 32,64,128,256 and 512-bit /*
1188	.sse_to_integer: `3`, .integer_to_sse: `3`, / SSE->integer and integer->SSE moves /
1189	.mask_to_integer: `3`, .integer_to_mask: `3`, / mask->integer and integer->mask moves /
1190	.mask_load: {`3`, `4`, `3`}, / cost of loading mask register*
1191	in QImode, HImode, SImode. /*
1192	.mask_store: {`3`, `4`, `3`}, / cost if storing mask register*
1193	in QImode, HImode, SImode. /*
1194	.mask_move: `2`, / cost of moving mask register. /
1195
1196	/ On K8:*
1197	MOVD reg64, xmmreg Double FSTORE 4
1198	MOVD reg32, xmmreg Double FSTORE 4
1199	On AMDFAM10:
1200	MOVD reg64, xmmreg Double FADD 3
1201	1/1 1/1
1202	MOVD reg32, xmmreg Double FADD 3
1203	1/1 1/1 /*
1204	/ End of register allocator costs. /
1205	},
1206
1207	COSTS_N_INSNS (`1`), / cost of an add instruction /
1208	COSTS_N_INSNS (`2`), / cost of a lea instruction /
1209	COSTS_N_INSNS (`1`), / variable shift costs /
1210	COSTS_N_INSNS (`1`), / constant shift costs /
1211	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
1212	COSTS_N_INSNS (`4`), / HI /
1213	COSTS_N_INSNS (`3`), / SI /
1214	COSTS_N_INSNS (`4`), / DI /
1215	COSTS_N_INSNS (`5`)}, / other /
1216	.mult_bit: `0`, / cost of multiply per each bit set /
1217	.divide: {COSTS_N_INSNS (`19`), / cost of a divide/mod for QI /
1218	COSTS_N_INSNS (`35`), / HI /
1219	COSTS_N_INSNS (`51`), / SI /
1220	COSTS_N_INSNS (`83`), / DI /
1221	COSTS_N_INSNS (`83`)}, / other /
1222	COSTS_N_INSNS (`1`), / cost of movsx /
1223	COSTS_N_INSNS (`1`), / cost of movzx /
1224	.large_insn: `8`, / "large" insn /
1225	.move_ratio: `9`, / MOVE_RATIO /
1226	.clear_ratio: `6`, / CLEAR_RATIO /
1227	.int_load: {`3`, `4`, `3`}, / cost of loading integer registers*
1228	in QImode, HImode and SImode.
1229	Relative to reg-reg move (2). /*
1230	.int_store: {`3`, `4`, `3`}, / cost of storing integer registers /
1231	.sse_load: {`4`, `4`, `3`, `6`, `12`}, / cost of loading SSE register*
1232	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1233	.sse_store: {`4`, `4`, `5`, `10`, `20`}, / cost of storing SSE register*
1234	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1235	.sse_unaligned_load: {`4`, `4`, `3`, `7`, `12`}, / cost of unaligned loads. /
1236	.sse_unaligned_store: {`4`, `4`, `5`, `10`, `20`}, / cost of unaligned stores. /
1237	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
1238	.sse_to_integer: `3`, / cost of moving SSE register to integer. /
1239	.gather_static: `4`, .gather_per_elt: `4`, / Gather load static, per_elt. /
1240	.scatter_static: `4`, .scatter_per_elt: `4`, / Gather store static, per_elt. /
1241	.l1_cache_size: `64`, / size of l1 cache. /
1242	.l2_cache_size: `512`, / size of l2 cache. /
1243	.prefetch_block: `64`, / size of prefetch block /
1244	/ New AMD processors never drop prefetches; if they cannot be performed*
1245	immediately, they are queued. We set number of simultaneous prefetches
1246	to a large constant to reflect this (it probably is not a good idea not
1247	to limit number of prefetches at all, as their execution also takes some
1248	time). /*
1249	.simultaneous_prefetches: `100`, / number of parallel prefetches /
1250	.branch_cost: `2`, / Branch cost /
1251	COSTS_N_INSNS (`4`), / cost of FADD and FSUB insns. /
1252	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
1253	COSTS_N_INSNS (`19`), / cost of FDIV instruction. /
1254	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
1255	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
1256	COSTS_N_INSNS (`35`), / cost of FSQRT instruction. /
1257
1258	COSTS_N_INSNS (`2`), / cost of cheap SSE instruction. /
1259	COSTS_N_INSNS (`4`), / cost of ADDSS/SD SUBSS/SD insns. /
1260	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
1261	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
1262	COSTS_N_INSNS (`8`), / cost of FMA SS instruction. /
1263	COSTS_N_INSNS (`8`), / cost of FMA SD instruction. /
1264	/ 11-16 /
1265	COSTS_N_INSNS (`16`), / cost of DIVSS instruction. /
1266	COSTS_N_INSNS (`20`), / cost of DIVSD instruction. /
1267	COSTS_N_INSNS (`19`), / cost of SQRTSS instruction. /
1268	COSTS_N_INSNS (`27`), / cost of SQRTSD instruction. /
1269	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
1270	.memcpy: amdfam10_memcpy,
1271	.memset: amdfam10_memset,
1272	COSTS_N_INSNS (`2`), / cond_taken_branch_cost. /
1273	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
1274	.align_loop: "32:25:8", / Loop alignment. /
1275	.align_jump: "32:8:8", / Jump alignment. /
1276	.align_label: "0:0:8", / Label alignment. /
1277	.align_func: "32", / Func alignment. /
1278	.small_unroll_ninsns: `4`, / Small unroll limit. /
1279	.small_unroll_factor: `2`, / Small unroll factor. /
1280	};
1281
1282	/ BDVER has optimized REP instruction for medium sized blocks, but for*
1283	very small blocks it is better to use loop. For large blocks, libcall
1284	can do nontemporary accesses and beat inline considerably. /*
1285	static stringop_algs bdver_memcpy[`2`] = {
1286	{.unknown_size: libcall, .size: {{`6`, loop, false}, {`14`, unrolled_loop, false},
1287	{-`1`, rep_prefix_4_byte, false}}},
1288	{.unknown_size: libcall, .size: {{`16`, loop, false}, {`8192`, rep_prefix_8_byte, false},
1289	{-`1`, libcall, false}}}};
1290	static stringop_algs bdver_memset[`2`] = {
1291	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`24`, unrolled_loop, false},
1292	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
1293	{.unknown_size: libcall, .size: {{`48`, unrolled_loop, false}, {`8192`, rep_prefix_8_byte, false},
1294	{-`1`, libcall, false}}}};
1295
1296	const struct processor_costs bdver_cost = {
1297	.hard_register: {
1298	/ Start of register allocator costs. integer->integer move cost is 2. /
1299	.movzbl_load: `8`, / cost for loading QImode using movzbl /
1300	.int_load: {`8`, `8`, `8`}, / cost of loading integer registers*
1301	in QImode, HImode and SImode.
1302	Relative to reg-reg move (2). /*
1303	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers /
1304	.fp_move: `4`, / cost of reg,reg fld/fst /
1305	.fp_load: {`12`, `12`, `28`}, / cost of loading fp registers*
1306	in SFmode, DFmode and XFmode /*
1307	.fp_store: {`10`, `10`, `18`}, / cost of storing fp registers*
1308	in SFmode, DFmode and XFmode /*
1309	.mmx_move: `4`, / cost of moving MMX register /
1310	.mmx_load: {`12`, `12`}, / cost of loading MMX registers*
1311	in SImode and DImode /*
1312	.mmx_store: {`10`, `10`}, / cost of storing MMX registers*
1313	in SImode and DImode /*
1314	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
1315	.sse_load: {`12`, `12`, `10`, `40`, `60`}, / cost of loading SSE registers*
1316	in 32,64,128,256 and 512-bit /*
1317	.sse_store: {`10`, `10`, `10`, `40`, `60`}, / cost of storing SSE registers*
1318	in 32,64,128,256 and 512-bit /*
1319	.sse_to_integer: `16`, .integer_to_sse: `20`, / SSE->integer and integer->SSE moves /
1320	.mask_to_integer: `16`, .integer_to_mask: `20`, / mask->integer and integer->mask moves /
1321	.mask_load: {`8`, `8`, `8`}, / cost of loading mask register*
1322	in QImode, HImode, SImode. /*
1323	.mask_store: {`8`, `8`, `8`}, / cost if storing mask register*
1324	in QImode, HImode, SImode. /*
1325	.mask_move: `2`, / cost of moving mask register. /
1326	/ End of register allocator costs. /
1327	},
1328
1329	COSTS_N_INSNS (`1`), / cost of an add instruction /
1330	COSTS_N_INSNS (`1`), / cost of a lea instruction /
1331	COSTS_N_INSNS (`1`), / variable shift costs /
1332	COSTS_N_INSNS (`1`), / constant shift costs /
1333	.mult_init: {COSTS_N_INSNS (`4`), / cost of starting multiply for QI /
1334	COSTS_N_INSNS (`4`), / HI /
1335	COSTS_N_INSNS (`4`), / SI /
1336	COSTS_N_INSNS (`6`), / DI /
1337	COSTS_N_INSNS (`6`)}, / other /
1338	.mult_bit: `0`, / cost of multiply per each bit set /
1339	.divide: {COSTS_N_INSNS (`19`), / cost of a divide/mod for QI /
1340	COSTS_N_INSNS (`35`), / HI /
1341	COSTS_N_INSNS (`51`), / SI /
1342	COSTS_N_INSNS (`83`), / DI /
1343	COSTS_N_INSNS (`83`)}, / other /
1344	COSTS_N_INSNS (`1`), / cost of movsx /
1345	COSTS_N_INSNS (`1`), / cost of movzx /
1346	.large_insn: `8`, / "large" insn /
1347	.move_ratio: `9`, / MOVE_RATIO /
1348	.clear_ratio: `6`, / CLEAR_RATIO /
1349	.int_load: {`8`, `8`, `8`}, / cost of loading integer registers*
1350	in QImode, HImode and SImode.
1351	Relative to reg-reg move (2). /*
1352	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers /
1353	.sse_load: {`12`, `12`, `10`, `40`, `60`}, / cost of loading SSE register*
1354	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1355	.sse_store: {`10`, `10`, `10`, `40`, `60`}, / cost of storing SSE register*
1356	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1357	.sse_unaligned_load: {`12`, `12`, `10`, `40`, `60`}, / cost of unaligned loads. /
1358	.sse_unaligned_store: {`10`, `10`, `10`, `40`, `60`}, / cost of unaligned stores. /
1359	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
1360	.sse_to_integer: `16`, / cost of moving SSE register to integer. /
1361	.gather_static: `12`, .gather_per_elt: `12`, / Gather load static, per_elt. /
1362	.scatter_static: `10`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
1363	.l1_cache_size: `16`, / size of l1 cache. /
1364	.l2_cache_size: `2048`, / size of l2 cache. /
1365	.prefetch_block: `64`, / size of prefetch block /
1366	/ New AMD processors never drop prefetches; if they cannot be performed*
1367	immediately, they are queued. We set number of simultaneous prefetches
1368	to a large constant to reflect this (it probably is not a good idea not
1369	to limit number of prefetches at all, as their execution also takes some
1370	time). /*
1371	.simultaneous_prefetches: `100`, / number of parallel prefetches /
1372	.branch_cost: `2`, / Branch cost /
1373	COSTS_N_INSNS (`6`), / cost of FADD and FSUB insns. /
1374	COSTS_N_INSNS (`6`), / cost of FMUL instruction. /
1375	COSTS_N_INSNS (`42`), / cost of FDIV instruction. /
1376	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
1377	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
1378	COSTS_N_INSNS (`52`), / cost of FSQRT instruction. /
1379
1380	COSTS_N_INSNS (`2`), / cost of cheap SSE instruction. /
1381	COSTS_N_INSNS (`6`), / cost of ADDSS/SD SUBSS/SD insns. /
1382	COSTS_N_INSNS (`6`), / cost of MULSS instruction. /
1383	COSTS_N_INSNS (`6`), / cost of MULSD instruction. /
1384	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
1385	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
1386	/ 9-24 /
1387	COSTS_N_INSNS (`24`), / cost of DIVSS instruction. /
1388	/ 9-27 /
1389	COSTS_N_INSNS (`27`), / cost of DIVSD instruction. /
1390	COSTS_N_INSNS (`15`), / cost of SQRTSS instruction. /
1391	COSTS_N_INSNS (`26`), / cost of SQRTSD instruction. /
1392	.reassoc_int: `1`, .reassoc_fp: `2`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
1393	.memcpy: bdver_memcpy,
1394	.memset: bdver_memset,
1395	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
1396	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
1397	.align_loop: "16:11:8", / Loop alignment. /
1398	.align_jump: "16:8:8", / Jump alignment. /
1399	.align_label: "0:0:8", / Label alignment. /
1400	.align_func: "11", / Func alignment. /
1401	.small_unroll_ninsns: `4`, / Small unroll limit. /
1402	.small_unroll_factor: `2`, / Small unroll factor. /
1403	};
1404
1405
1406	/ ZNVER1 has optimized REP instruction for medium sized blocks, but for*
1407	very small blocks it is better to use loop. For large blocks, libcall
1408	can do nontemporary accesses and beat inline considerably. /*
1409	static stringop_algs znver1_memcpy[`2`] = {
1410	/ 32-bit tuning. /
1411	{.unknown_size: libcall, .size: {{`6`, loop, false},
1412	{`14`, unrolled_loop, false},
1413	{-`1`, libcall, false}}},
1414	/ 64-bit tuning. /
1415	{.unknown_size: libcall, .size: {{`16`, loop, false},
1416	{`128`, rep_prefix_8_byte, false},
1417	{-`1`, libcall, false}}}};
1418	static stringop_algs znver1_memset[`2`] = {
1419	/ 32-bit tuning. /
1420	{.unknown_size: libcall, .size: {{`8`, loop, false},
1421	{`24`, unrolled_loop, false},
1422	{`128`, rep_prefix_4_byte, false},
1423	{-`1`, libcall, false}}},
1424	/ 64-bit tuning. /
1425	{.unknown_size: libcall, .size: {{`48`, unrolled_loop, false},
1426	{`128`, rep_prefix_8_byte, false},
1427	{-`1`, libcall, false}}}};
1428	struct processor_costs znver1_cost = {
1429	.hard_register: {
1430	/ Start of register allocator costs. integer->integer move cost is 2. /
1431
1432	/ reg-reg moves are done by renaming and thus they are even cheaper than*
1433	1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
1434	to doubles of latencies, we do not model this correctly. It does not
1435	seem to make practical difference to bump prices up even more. /*
1436	.movzbl_load: `6`, / cost for loading QImode using*
1437	movzbl. /*
1438	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1439	in QImode, HImode and SImode.
1440	Relative to reg-reg move (2). /*
1441	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1442	registers. /*
1443	.fp_move: `2`, / cost of reg,reg fld/fst. /
1444	.fp_load: {`6`, `6`, `16`}, / cost of loading fp registers*
1445	in SFmode, DFmode and XFmode. /*
1446	.fp_store: {`8`, `8`, `16`}, / cost of storing fp registers*
1447	in SFmode, DFmode and XFmode. /*
1448	.mmx_move: `2`, / cost of moving MMX register. /
1449	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
1450	in SImode and DImode. /*
1451	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
1452	in SImode and DImode. /*
1453	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `6`, / cost of moving XMM,YMM,ZMM register. /
1454	.sse_load: {`6`, `6`, `6`, `12`, `24`}, / cost of loading SSE registers*
1455	in 32,64,128,256 and 512-bit. /*
1456	.sse_store: {`8`, `8`, `8`, `16`, `32`}, / cost of storing SSE registers*
1457	in 32,64,128,256 and 512-bit. /*
1458	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves. /
1459	.mask_to_integer: `8`, .integer_to_mask: `8`, / mask->integer and integer->mask moves /
1460	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
1461	in QImode, HImode, SImode. /*
1462	.mask_store: {`8`, `8`, `8`}, / cost if storing mask register*
1463	in QImode, HImode, SImode. /*
1464	.mask_move: `2`, / cost of moving mask register. /
1465	/ End of register allocator costs. /
1466	},
1467
1468	COSTS_N_INSNS (`1`), / cost of an add instruction. /
1469	COSTS_N_INSNS (`1`), / cost of a lea instruction. /
1470	COSTS_N_INSNS (`1`), / variable shift costs. /
1471	COSTS_N_INSNS (`1`), / constant shift costs. /
1472	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI. /
1473	COSTS_N_INSNS (`3`), / HI. /
1474	COSTS_N_INSNS (`3`), / SI. /
1475	COSTS_N_INSNS (`3`), / DI. /
1476	COSTS_N_INSNS (`3`)}, / other. /
1477	.mult_bit: `0`, / cost of multiply per each bit*
1478	set. /*
1479	/ Depending on parameters, idiv can get faster on ryzen. This is upper*
1480	bound. /*
1481	.divide: {COSTS_N_INSNS (`16`), / cost of a divide/mod for QI. /
1482	COSTS_N_INSNS (`22`), / HI. /
1483	COSTS_N_INSNS (`30`), / SI. /
1484	COSTS_N_INSNS (`45`), / DI. /
1485	COSTS_N_INSNS (`45`)}, / other. /
1486	COSTS_N_INSNS (`1`), / cost of movsx. /
1487	COSTS_N_INSNS (`1`), / cost of movzx. /
1488	.large_insn: `8`, / "large" insn. /
1489	.move_ratio: `9`, / MOVE_RATIO. /
1490	.clear_ratio: `6`, / CLEAR_RATIO /
1491	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1492	in QImode, HImode and SImode.
1493	Relative to reg-reg move (2). /*
1494	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1495	registers. /*
1496	.sse_load: {`6`, `6`, `6`, `12`, `24`}, / cost of loading SSE register*
1497	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1498	.sse_store: {`8`, `8`, `8`, `16`, `32`}, / cost of storing SSE register*
1499	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1500	.sse_unaligned_load: {`6`, `6`, `6`, `12`, `24`}, / cost of unaligned loads. /
1501	.sse_unaligned_store: {`8`, `8`, `8`, `16`, `32`}, / cost of unaligned stores. /
1502	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `6`, / cost of moving XMM,YMM,ZMM register. /
1503	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
1504	/ VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,*
1505	throughput 12. Approx 9 uops do not depend on vector size and every load
1506	is 7 uops. /*
1507	.gather_static: `18`, .gather_per_elt: `8`, / Gather load static, per_elt. /
1508	.scatter_static: `18`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
1509	.l1_cache_size: `32`, / size of l1 cache. /
1510	.l2_cache_size: `512`, / size of l2 cache. /
1511	.prefetch_block: `64`, / size of prefetch block. /
1512	/ New AMD processors never drop prefetches; if they cannot be performed*
1513	immediately, they are queued. We set number of simultaneous prefetches
1514	to a large constant to reflect this (it probably is not a good idea not
1515	to limit number of prefetches at all, as their execution also takes some
1516	time). /*
1517	.simultaneous_prefetches: `100`, / number of parallel prefetches. /
1518	.branch_cost: `3`, / Branch cost. /
1519	COSTS_N_INSNS (`5`), / cost of FADD and FSUB insns. /
1520	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
1521	/ Latency of fdiv is 8-15. /
1522	COSTS_N_INSNS (`15`), / cost of FDIV instruction. /
1523	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
1524	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
1525	/ Latency of fsqrt is 4-10. /
1526	COSTS_N_INSNS (`10`), / cost of FSQRT instruction. /
1527
1528	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
1529	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
1530	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
1531	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
1532	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
1533	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
1534	COSTS_N_INSNS (`10`), / cost of DIVSS instruction. /
1535	/ 9-13 /
1536	COSTS_N_INSNS (`13`), / cost of DIVSD instruction. /
1537	COSTS_N_INSNS (`10`), / cost of SQRTSS instruction. /
1538	COSTS_N_INSNS (`15`), / cost of SQRTSD instruction. /
1539	/ Zen can execute 4 integer operations per cycle. FP operations take 3 cycles*
1540	and it can execute 2 integer additions and 2 multiplications thus
1541	reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1542	that 4 works better than 6 probably due to register pressure.
1543
1544	Integer vector operations are taken by FP unit and execute 3 vector
1545	plus/minus operations per cycle but only one multiply. This is adjusted
1546	in ix86_reassociation_width. /*
1547	.reassoc_int: `4`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `6`, / reassoc int, fp, vec_int, vec_fp. /
1548	.memcpy: znver1_memcpy,
1549	.memset: znver1_memset,
1550	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
1551	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
1552	.align_loop: "16", / Loop alignment. /
1553	.align_jump: "16", / Jump alignment. /
1554	.align_label: "0:0:8", / Label alignment. /
1555	.align_func: "16", / Func alignment. /
1556	.small_unroll_ninsns: `4`, / Small unroll limit. /
1557	.small_unroll_factor: `2`, / Small unroll factor. /
1558	};
1559
1560	/ ZNVER2 has optimized REP instruction for medium sized blocks, but for*
1561	very small blocks it is better to use loop. For large blocks, libcall
1562	can do nontemporary accesses and beat inline considerably. /*
1563	static stringop_algs znver2_memcpy[`2`] = {
1564	/ 32-bit tuning. /
1565	{.unknown_size: libcall, .size: {{`6`, loop, false},
1566	{`14`, unrolled_loop, false},
1567	{-`1`, libcall, false}}},
1568	/ 64-bit tuning. /
1569	{.unknown_size: libcall, .size: {{`16`, loop, false},
1570	{`64`, rep_prefix_4_byte, false},
1571	{-`1`, libcall, false}}}};
1572	static stringop_algs znver2_memset[`2`] = {
1573	/ 32-bit tuning. /
1574	{.unknown_size: libcall, .size: {{`8`, loop, false},
1575	{`24`, unrolled_loop, false},
1576	{`128`, rep_prefix_4_byte, false},
1577	{-`1`, libcall, false}}},
1578	/ 64-bit tuning. /
1579	{.unknown_size: libcall, .size: {{`24`, rep_prefix_4_byte, false},
1580	{`128`, rep_prefix_8_byte, false},
1581	{-`1`, libcall, false}}}};
1582
1583	struct processor_costs znver2_cost = {
1584	.hard_register: {
1585	/ Start of register allocator costs. integer->integer move cost is 2. /
1586
1587	/ reg-reg moves are done by renaming and thus they are even cheaper than*
1588	1 cycle. Because reg-reg move cost is 2 and following tables correspond
1589	to doubles of latencies, we do not model this correctly. It does not
1590	seem to make practical difference to bump prices up even more. /*
1591	.movzbl_load: `6`, / cost for loading QImode using*
1592	movzbl. /*
1593	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1594	in QImode, HImode and SImode.
1595	Relative to reg-reg move (2). /*
1596	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1597	registers. /*
1598	.fp_move: `2`, / cost of reg,reg fld/fst. /
1599	.fp_load: {`6`, `6`, `16`}, / cost of loading fp registers*
1600	in SFmode, DFmode and XFmode. /*
1601	.fp_store: {`8`, `8`, `16`}, / cost of storing fp registers*
1602	in SFmode, DFmode and XFmode. /*
1603	.mmx_move: `2`, / cost of moving MMX register. /
1604	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
1605	in SImode and DImode. /*
1606	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
1607	in SImode and DImode. /*
1608	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM*
1609	register. /*
1610	.sse_load: {`6`, `6`, `6`, `6`, `12`}, / cost of loading SSE registers*
1611	in 32,64,128,256 and 512-bit. /*
1612	.sse_store: {`8`, `8`, `8`, `8`, `16`}, / cost of storing SSE registers*
1613	in 32,64,128,256 and 512-bit. /*
1614	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE*
1615	moves. /*
1616	.mask_to_integer: `8`, .integer_to_mask: `8`, / mask->integer and integer->mask moves /
1617	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
1618	in QImode, HImode, SImode. /*
1619	.mask_store: {`8`, `8`, `8`}, / cost if storing mask register*
1620	in QImode, HImode, SImode. /*
1621	.mask_move: `2`, / cost of moving mask register. /
1622	/ End of register allocator costs. /
1623	},
1624
1625	COSTS_N_INSNS (`1`), / cost of an add instruction. /
1626	COSTS_N_INSNS (`1`), / cost of a lea instruction. /
1627	COSTS_N_INSNS (`1`), / variable shift costs. /
1628	COSTS_N_INSNS (`1`), / constant shift costs. /
1629	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI. /
1630	COSTS_N_INSNS (`3`), / HI. /
1631	COSTS_N_INSNS (`3`), / SI. /
1632	COSTS_N_INSNS (`3`), / DI. /
1633	COSTS_N_INSNS (`3`)}, / other. /
1634	.mult_bit: `0`, / cost of multiply per each bit*
1635	set. /*
1636	/ Depending on parameters, idiv can get faster on ryzen. This is upper*
1637	bound. /*
1638	.divide: {COSTS_N_INSNS (`16`), / cost of a divide/mod for QI. /
1639	COSTS_N_INSNS (`22`), / HI. /
1640	COSTS_N_INSNS (`30`), / SI. /
1641	COSTS_N_INSNS (`45`), / DI. /
1642	COSTS_N_INSNS (`45`)}, / other. /
1643	COSTS_N_INSNS (`1`), / cost of movsx. /
1644	COSTS_N_INSNS (`1`), / cost of movzx. /
1645	.large_insn: `8`, / "large" insn. /
1646	.move_ratio: `9`, / MOVE_RATIO. /
1647	.clear_ratio: `6`, / CLEAR_RATIO /
1648	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1649	in QImode, HImode and SImode.
1650	Relative to reg-reg move (2). /*
1651	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1652	registers. /*
1653	.sse_load: {`6`, `6`, `6`, `6`, `12`}, / cost of loading SSE registers*
1654	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1655	.sse_store: {`8`, `8`, `8`, `8`, `16`}, / cost of storing SSE register*
1656	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1657	.sse_unaligned_load: {`6`, `6`, `6`, `6`, `12`}, / cost of unaligned loads. /
1658	.sse_unaligned_store: {`8`, `8`, `8`, `8`, `16`}, / cost of unaligned stores. /
1659	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM*
1660	register. /*
1661	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
1662	/ VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,*
1663	throughput 12. Approx 9 uops do not depend on vector size and every load
1664	is 7 uops. /*
1665	.gather_static: `18`, .gather_per_elt: `8`, / Gather load static, per_elt. /
1666	.scatter_static: `18`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
1667	.l1_cache_size: `32`, / size of l1 cache. /
1668	.l2_cache_size: `512`, / size of l2 cache. /
1669	.prefetch_block: `64`, / size of prefetch block. /
1670	/ New AMD processors never drop prefetches; if they cannot be performed*
1671	immediately, they are queued. We set number of simultaneous prefetches
1672	to a large constant to reflect this (it probably is not a good idea not
1673	to limit number of prefetches at all, as their execution also takes some
1674	time). /*
1675	.simultaneous_prefetches: `100`, / number of parallel prefetches. /
1676	.branch_cost: `3`, / Branch cost. /
1677	COSTS_N_INSNS (`5`), / cost of FADD and FSUB insns. /
1678	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
1679	/ Latency of fdiv is 8-15. /
1680	COSTS_N_INSNS (`15`), / cost of FDIV instruction. /
1681	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
1682	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
1683	/ Latency of fsqrt is 4-10. /
1684	COSTS_N_INSNS (`10`), / cost of FSQRT instruction. /
1685
1686	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
1687	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
1688	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
1689	COSTS_N_INSNS (`3`), / cost of MULSD instruction. /
1690	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
1691	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
1692	COSTS_N_INSNS (`10`), / cost of DIVSS instruction. /
1693	/ 9-13. /
1694	COSTS_N_INSNS (`13`), / cost of DIVSD instruction. /
1695	COSTS_N_INSNS (`10`), / cost of SQRTSS instruction. /
1696	COSTS_N_INSNS (`15`), / cost of SQRTSD instruction. /
1697	/ Zen can execute 4 integer operations per cycle. FP operations*
1698	take 3 cycles and it can execute 2 integer additions and 2
1699	multiplications thus reassociation may make sense up to with of 6.
1700	SPEC2k6 bencharks suggests
1701	that 4 works better than 6 probably due to register pressure.
1702
1703	Integer vector operations are taken by FP unit and execute 3 vector
1704	plus/minus operations per cycle but only one multiply. This is adjusted
1705	in ix86_reassociation_width. /*
1706	.reassoc_int: `4`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `6`, / reassoc int, fp, vec_int, vec_fp. /
1707	.memcpy: znver2_memcpy,
1708	.memset: znver2_memset,
1709	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
1710	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
1711	.align_loop: "16", / Loop alignment. /
1712	.align_jump: "16", / Jump alignment. /
1713	.align_label: "0:0:8", / Label alignment. /
1714	.align_func: "16", / Func alignment. /
1715	.small_unroll_ninsns: `4`, / Small unroll limit. /
1716	.small_unroll_factor: `2`, / Small unroll factor. /
1717	};
1718
1719	struct processor_costs znver3_cost = {
1720	.hard_register: {
1721	/ Start of register allocator costs. integer->integer move cost is 2. /
1722
1723	/ reg-reg moves are done by renaming and thus they are even cheaper than*
1724	1 cycle. Because reg-reg move cost is 2 and following tables correspond
1725	to doubles of latencies, we do not model this correctly. It does not
1726	seem to make practical difference to bump prices up even more. /*
1727	.movzbl_load: `6`, / cost for loading QImode using*
1728	movzbl. /*
1729	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1730	in QImode, HImode and SImode.
1731	Relative to reg-reg move (2). /*
1732	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1733	registers. /*
1734	.fp_move: `2`, / cost of reg,reg fld/fst. /
1735	.fp_load: {`6`, `6`, `16`}, / cost of loading fp registers*
1736	in SFmode, DFmode and XFmode. /*
1737	.fp_store: {`8`, `8`, `16`}, / cost of storing fp registers*
1738	in SFmode, DFmode and XFmode. /*
1739	.mmx_move: `2`, / cost of moving MMX register. /
1740	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
1741	in SImode and DImode. /*
1742	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
1743	in SImode and DImode. /*
1744	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM*
1745	register. /*
1746	.sse_load: {`6`, `6`, `6`, `6`, `12`}, / cost of loading SSE registers*
1747	in 32,64,128,256 and 512-bit. /*
1748	.sse_store: {`8`, `8`, `8`, `8`, `16`}, / cost of storing SSE registers*
1749	in 32,64,128,256 and 512-bit. /*
1750	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE*
1751	moves. /*
1752	.mask_to_integer: `8`, .integer_to_mask: `8`, / mask->integer and integer->mask moves /
1753	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
1754	in QImode, HImode, SImode. /*
1755	.mask_store: {`8`, `8`, `8`}, / cost if storing mask register*
1756	in QImode, HImode, SImode. /*
1757	.mask_move: `2`, / cost of moving mask register. /
1758	/ End of register allocator costs. /
1759	},
1760
1761	COSTS_N_INSNS (`1`), / cost of an add instruction. /
1762	COSTS_N_INSNS (`1`), / cost of a lea instruction. /
1763	COSTS_N_INSNS (`1`), / variable shift costs. /
1764	COSTS_N_INSNS (`1`), / constant shift costs. /
1765	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI. /
1766	COSTS_N_INSNS (`3`), / HI. /
1767	COSTS_N_INSNS (`3`), / SI. /
1768	COSTS_N_INSNS (`3`), / DI. /
1769	COSTS_N_INSNS (`3`)}, / other. /
1770	.mult_bit: `0`, / cost of multiply per each bit*
1771	set. /*
1772	.divide: {COSTS_N_INSNS (`9`), / cost of a divide/mod for QI. /
1773	COSTS_N_INSNS (`10`), / HI. /
1774	COSTS_N_INSNS (`12`), / SI. /
1775	COSTS_N_INSNS (`17`), / DI. /
1776	COSTS_N_INSNS (`17`)}, / other. /
1777	COSTS_N_INSNS (`1`), / cost of movsx. /
1778	COSTS_N_INSNS (`1`), / cost of movzx. /
1779	.large_insn: `8`, / "large" insn. /
1780	.move_ratio: `9`, / MOVE_RATIO. /
1781	.clear_ratio: `6`, / CLEAR_RATIO /
1782	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1783	in QImode, HImode and SImode.
1784	Relative to reg-reg move (2). /*
1785	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1786	registers. /*
1787	.sse_load: {`6`, `6`, `6`, `6`, `12`}, / cost of loading SSE registers*
1788	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1789	.sse_store: {`8`, `8`, `8`, `8`, `16`}, / cost of storing SSE register*
1790	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1791	.sse_unaligned_load: {`6`, `6`, `6`, `6`, `12`}, / cost of unaligned loads. /
1792	.sse_unaligned_store: {`8`, `8`, `8`, `8`, `16`}, / cost of unaligned stores. /
1793	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM*
1794	register. /*
1795	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
1796	/ VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,*
1797	throughput 9. Approx 7 uops do not depend on vector size and every load
1798	is 4 uops. /*
1799	.gather_static: `14`, .gather_per_elt: `8`, / Gather load static, per_elt. /
1800	.scatter_static: `14`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
1801	.l1_cache_size: `32`, / size of l1 cache. /
1802	.l2_cache_size: `512`, / size of l2 cache. /
1803	.prefetch_block: `64`, / size of prefetch block. /
1804	/ New AMD processors never drop prefetches; if they cannot be performed*
1805	immediately, they are queued. We set number of simultaneous prefetches
1806	to a large constant to reflect this (it probably is not a good idea not
1807	to limit number of prefetches at all, as their execution also takes some
1808	time). /*
1809	.simultaneous_prefetches: `100`, / number of parallel prefetches. /
1810	.branch_cost: `3`, / Branch cost. /
1811	COSTS_N_INSNS (`5`), / cost of FADD and FSUB insns. /
1812	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
1813	/ Latency of fdiv is 8-15. /
1814	COSTS_N_INSNS (`15`), / cost of FDIV instruction. /
1815	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
1816	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
1817	/ Latency of fsqrt is 4-10. /
1818	COSTS_N_INSNS (`10`), / cost of FSQRT instruction. /
1819
1820	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
1821	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
1822	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
1823	COSTS_N_INSNS (`3`), / cost of MULSD instruction. /
1824	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
1825	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
1826	COSTS_N_INSNS (`10`), / cost of DIVSS instruction. /
1827	/ 9-13. /
1828	COSTS_N_INSNS (`13`), / cost of DIVSD instruction. /
1829	COSTS_N_INSNS (`10`), / cost of SQRTSS instruction. /
1830	COSTS_N_INSNS (`15`), / cost of SQRTSD instruction. /
1831	/ Zen can execute 4 integer operations per cycle. FP operations*
1832	take 3 cycles and it can execute 2 integer additions and 2
1833	multiplications thus reassociation may make sense up to with of 6.
1834	SPEC2k6 bencharks suggests
1835	that 4 works better than 6 probably due to register pressure.
1836
1837	Integer vector operations are taken by FP unit and execute 3 vector
1838	plus/minus operations per cycle but only one multiply. This is adjusted
1839	in ix86_reassociation_width. /*
1840	.reassoc_int: `4`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `6`, / reassoc int, fp, vec_int, vec_fp. /
1841	.memcpy: znver2_memcpy,
1842	.memset: znver2_memset,
1843	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
1844	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
1845	.align_loop: "16", / Loop alignment. /
1846	.align_jump: "16", / Jump alignment. /
1847	.align_label: "0:0:8", / Label alignment. /
1848	.align_func: "16", / Func alignment. /
1849	.small_unroll_ninsns: `4`, / Small unroll limit. /
1850	.small_unroll_factor: `2`, / Small unroll factor. /
1851	};
1852
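/* This table is based on the znver3_cost table but no longer replicates it
   exactly: several register allocator costs (fp_load, fp_store, sse_load,
   sse_store, integer_to_sse) differ.  */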
/* Per-operation cost table named for the znver4 tuning.  The initializer
   is positional: every entry must stay in the field order of struct
   processor_costs, which is declared outside this part of the file.  The
   first brace group holds the register allocator costs, expressed
   relative to an integer reg-reg move of cost 2; the instruction costs
   that follow are written with COSTS_N_INSNS.  String operations reuse
   znver2_memcpy and znver2_memset.
   NOTE(review): the "Latency of fsqrt is 4-10" remark below sits directly
   above an FSQRT cost of 25, and the scatter_static/scatter_per_elt line
   is labelled "Gather store"; both remarks look stale -- confirm.  */
1854	struct processor_costs znver4_cost = {
1855	.hard_register: {
1856	/ Start of register allocator costs. integer->integer move cost is 2. /
1857
1858	/ reg-reg moves are done by renaming and thus they are even cheaper than*
1859	1 cycle. Because reg-reg move cost is 2 and following tables correspond
1860	to doubles of latencies, we do not model this correctly. It does not
1861	seem to make practical difference to bump prices up even more. /*
1862	.movzbl_load: `6`, / cost for loading QImode using*
1863	movzbl. /*
1864	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1865	in QImode, HImode and SImode.
1866	Relative to reg-reg move (2). /*
1867	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1868	registers. /*
1869	.fp_move: `2`, / cost of reg,reg fld/fst. /
1870	.fp_load: {`14`, `14`, `17`}, / cost of loading fp registers*
1871	in SFmode, DFmode and XFmode. /*
1872	.fp_store: {`12`, `12`, `16`}, / cost of storing fp registers*
1873	in SFmode, DFmode and XFmode. /*
1874	.mmx_move: `2`, / cost of moving MMX register. /
1875	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
1876	in SImode and DImode. /*
1877	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
1878	in SImode and DImode. /*
1879	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `3`, / cost of moving XMM,YMM,ZMM*
1880	register. /*
1881	.sse_load: {`6`, `6`, `10`, `10`, `12`}, / cost of loading SSE registers*
1882	in 32,64,128,256 and 512-bit. /*
1883	.sse_store: {`8`, `8`, `8`, `12`, `12`}, / cost of storing SSE registers*
1884	in 32,64,128,256 and 512-bit. /*
1885	.sse_to_integer: `6`, .integer_to_sse: `8`, / SSE->integer and integer->SSE*
1886	moves. /*
1887	.mask_to_integer: `8`, .integer_to_mask: `8`, / mask->integer and integer->mask moves /
1888	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
1889	in QImode, HImode, SImode. /*
1890	.mask_store: {`8`, `8`, `8`}, / cost if storing mask register*
1891	in QImode, HImode, SImode. /*
1892	.mask_move: `2`, / cost of moving mask register. /
1893	/ End of register allocator costs. /
1894	},
1895
1896	COSTS_N_INSNS (`1`), / cost of an add instruction. /
1897	/ TODO: Lea with 3 components has cost 2. /
1898	COSTS_N_INSNS (`1`), / cost of a lea instruction. /
1899	COSTS_N_INSNS (`1`), / variable shift costs. /
1900	COSTS_N_INSNS (`1`), / constant shift costs. /
1901	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI. /
1902	COSTS_N_INSNS (`3`), / HI. /
1903	COSTS_N_INSNS (`3`), / SI. /
1904	COSTS_N_INSNS (`3`), / DI. /
1905	COSTS_N_INSNS (`3`)}, / other. /
1906	.mult_bit: `0`, / cost of multiply per each bit*
1907	set. /*
1908	.divide: {COSTS_N_INSNS (`12`), / cost of a divide/mod for QI. /
1909	COSTS_N_INSNS (`13`), / HI. /
1910	COSTS_N_INSNS (`13`), / SI. /
1911	COSTS_N_INSNS (`18`), / DI. /
1912	COSTS_N_INSNS (`18`)}, / other. /
1913	COSTS_N_INSNS (`1`), / cost of movsx. /
1914	COSTS_N_INSNS (`1`), / cost of movzx. /
1915	.large_insn: `8`, / "large" insn. /
1916	.move_ratio: `9`, / MOVE_RATIO. /
1917	.clear_ratio: `6`, / CLEAR_RATIO /
1918	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
1919	in QImode, HImode and SImode.
1920	Relative to reg-reg move (2). /*
1921	.int_store: {`8`, `8`, `8`}, / cost of storing integer*
1922	registers. /*
1923	.sse_load: {`6`, `6`, `10`, `10`, `12`}, / cost of loading SSE registers*
1924	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1925	.sse_store: {`8`, `8`, `8`, `12`, `12`}, / cost of storing SSE register*
1926	in 32bit, 64bit, 128bit, 256bit and 512bit /*
1927	.sse_unaligned_load: {`6`, `6`, `6`, `6`, `6`}, / cost of unaligned loads. /
1928	.sse_unaligned_store: {`8`, `8`, `8`, `8`, `8`}, / cost of unaligned stores. /
1929	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `2`, / cost of moving XMM,YMM,ZMM*
1930	register. /*
1931	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
1932	/ VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,*
1933	throughput 5. Approx 7 uops do not depend on vector size and every load
1934	is 5 uops. /*
1935	.gather_static: `14`, .gather_per_elt: `10`, / Gather load static, per_elt. /
1936	.scatter_static: `14`, .scatter_per_elt: `20`, / Gather store static, per_elt. /
1937	.l1_cache_size: `32`, / size of l1 cache. /
1938	.l2_cache_size: `1024`, / size of l2 cache. /
1939	.prefetch_block: `64`, / size of prefetch block. /
1940	/ New AMD processors never drop prefetches; if they cannot be performed*
1941	immediately, they are queued. We set number of simultaneous prefetches
1942	to a large constant to reflect this (it probably is not a good idea not
1943	to limit number of prefetches at all, as their execution also takes some
1944	time). /*
1945	.simultaneous_prefetches: `100`, / number of parallel prefetches. /
1946	.branch_cost: `3`, / Branch cost. /
1947	COSTS_N_INSNS (`7`), / cost of FADD and FSUB insns. /
1948	COSTS_N_INSNS (`7`), / cost of FMUL instruction. /
1949	/ Latency of fdiv is 8-15. /
1950	COSTS_N_INSNS (`15`), / cost of FDIV instruction. /
1951	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
1952	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
1953	/ Latency of fsqrt is 4-10. /
1954	COSTS_N_INSNS (`25`), / cost of FSQRT instruction. /
1955
1956	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
1957	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
1958	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
1959	COSTS_N_INSNS (`3`), / cost of MULSD instruction. /
1960	COSTS_N_INSNS (`4`), / cost of FMA SS instruction. /
1961	COSTS_N_INSNS (`4`), / cost of FMA SD instruction. /
1962	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
1963	/ 9-13. /
1964	COSTS_N_INSNS (`13`), / cost of DIVSD instruction. /
1965	COSTS_N_INSNS (`15`), / cost of SQRTSS instruction. /
1966	COSTS_N_INSNS (`21`), / cost of SQRTSD instruction. /
1967	/ Zen can execute 4 integer operations per cycle. FP operations*
1968	take 3 cycles and it can execute 2 integer additions and 2
1969	multiplications thus reassociation may make sense up to with of 6.
1970	SPEC2k6 bencharks suggests
1971	that 4 works better than 6 probably due to register pressure.
1972
1973	Integer vector operations are taken by FP unit and execute 3 vector
1974	plus/minus operations per cycle but only one multiply. This is adjusted
1975	in ix86_reassociation_width. /*
1976	.reassoc_int: `4`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `6`, / reassoc int, fp, vec_int, vec_fp. /
1977	.memcpy: znver2_memcpy,
1978	.memset: znver2_memset,
1979	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
1980	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
1981	.align_loop: "16", / Loop alignment. /
1982	.align_jump: "16", / Jump alignment. /
1983	.align_label: "0:0:8", / Label alignment. /
1984	.align_func: "16", / Func alignment. /
1985	.small_unroll_ninsns: `4`, / Small unroll limit. /
1986	.small_unroll_factor: `2`, / Small unroll factor. /
1987	};
1988
1989	/* skylake_cost should produce code tuned for Skylake family of CPUs.  */
/* memcpy expansion strategies referenced by skylake_cost (.memcpy).
   The array has two entries with identical contents.  Each entry names
   the algorithm for blocks of unknown size (libcall) followed by
   {max size, algorithm, flag} triples.
   NOTE(review): the -1 size in the last triple presumably means "no
   upper bound", and the two entries presumably select between 32-bit
   and 64-bit targets -- confirm against the stringop_algs declaration.
   Both bounded triples use the limit 256, so when the second (loop) one
   is chosen depends on how the flag is interpreted -- confirm.  */
1990	static stringop_algs skylake_memcpy[`2`] = {
1991	{.unknown_size: libcall,
1992	.size: {{`256`, rep_prefix_1_byte, true},
1993	{`256`, loop, false},
1994	{-`1`, libcall, false}}},
1995	{.unknown_size: libcall,
1996	.size: {{`256`, rep_prefix_1_byte, true},
1997	{`256`, loop, false},
1998	{-`1`, libcall, false}}}};
1999
/* memset expansion strategies referenced by skylake_cost (.memset).
   The array has two entries with identical contents.  Each entry names
   the algorithm for blocks of unknown size (libcall) followed by
   {max size, algorithm, flag} triples.
   NOTE(review): the -1 size in the last triple presumably means "no
   upper bound", and the two entries presumably select between 32-bit
   and 64-bit targets -- confirm against the stringop_algs declaration.  */
2000	static stringop_algs skylake_memset[`2`] = {
2001	{.unknown_size: libcall,
2002	.size: {{`256`, rep_prefix_1_byte, true},
2003	{`256`, loop, false},
2004	{-`1`, libcall, false}}},
2005	{.unknown_size: libcall,
2006	.size: {{`256`, rep_prefix_1_byte, true},
2007	{`256`, loop, false},
2008	{-`1`, libcall, false}}}};
2009
/* Per-operation cost table for the Skylake tuning.  The initializer is
   positional: every entry must stay in the field order of struct
   processor_costs, which is declared outside this part of the file.  The
   first brace group holds the register allocator costs, expressed
   relative to an integer reg-reg move of cost 2; the instruction costs
   that follow are written with COSTS_N_INSNS.  The lea cost is
   COSTS_N_INSNS (1) + 1, i.e. slightly more than an add.  String
   operations use skylake_memcpy and skylake_memset.
   NOTE(review): the scatter_static/scatter_per_elt line is labelled
   "Gather store"; the label looks like a copy of the gather line --
   confirm.  */
2010	static const
2011	struct processor_costs skylake_cost = {
2012	.hard_register: {
2013	/ Start of register allocator costs. integer->integer move cost is 2. /
2014	.movzbl_load: `6`, / cost for loading QImode using movzbl /
2015	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
2016	in QImode, HImode and SImode.
2017	Relative to reg-reg move (2). /*
2018	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2019	.fp_move: `2`, / cost of reg,reg fld/fst /
2020	.fp_load: {`6`, `6`, `8`}, / cost of loading fp registers*
2021	in SFmode, DFmode and XFmode /*
2022	.fp_store: {`6`, `6`, `10`}, / cost of storing fp registers*
2023	in SFmode, DFmode and XFmode /*
2024	.mmx_move: `2`, / cost of moving MMX register /
2025	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
2026	in SImode and DImode /*
2027	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
2028	in SImode and DImode /*
2029	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
2030	.sse_load: {`6`, `6`, `6`, `10`, `20`}, / cost of loading SSE registers*
2031	in 32,64,128,256 and 512-bit /*
2032	.sse_store: {`8`, `8`, `8`, `12`, `24`}, / cost of storing SSE registers*
2033	in 32,64,128,256 and 512-bit /*
2034	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
2035	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
2036	.mask_load: {`8`, `8`, `8`}, / cost of loading mask register*
2037	in QImode, HImode, SImode. /*
2038	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
2039	in QImode, HImode, SImode. /*
2040	.mask_move: `3`, / cost of moving mask register. /
2041	/ End of register allocator costs. /
2042	},
2043
2044	COSTS_N_INSNS (`1`), / cost of an add instruction /
2045	COSTS_N_INSNS (`1`)+`1`, / cost of a lea instruction /
2046	COSTS_N_INSNS (`1`), / variable shift costs /
2047	COSTS_N_INSNS (`1`), / constant shift costs /
2048	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2049	COSTS_N_INSNS (`4`), / HI /
2050	COSTS_N_INSNS (`3`), / SI /
2051	COSTS_N_INSNS (`3`), / DI /
2052	COSTS_N_INSNS (`3`)}, / other /
2053	.mult_bit: `0`, / cost of multiply per each bit set /
2054	/ Expanding div/mod currently doesn't consider parallelism. So the cost*
2055	model is not realistic. We compensate by increasing the latencies a bit. /*
2056	.divide: {COSTS_N_INSNS (`11`), / cost of a divide/mod for QI /
2057	COSTS_N_INSNS (`11`), / HI /
2058	COSTS_N_INSNS (`14`), / SI /
2059	COSTS_N_INSNS (`76`), / DI /
2060	COSTS_N_INSNS (`76`)}, / other /
2061	COSTS_N_INSNS (`1`), / cost of movsx /
2062	COSTS_N_INSNS (`0`), / cost of movzx /
2063	.large_insn: `8`, / "large" insn /
2064	.move_ratio: `17`, / MOVE_RATIO /
2065	.clear_ratio: `17`, / CLEAR_RATIO /
2066	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
2067	in QImode, HImode and SImode.
2068	Relative to reg-reg move (2). /*
2069	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers /
2070	.sse_load: {`8`, `8`, `8`, `8`, `16`}, / cost of loading SSE register*
2071	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2072	.sse_store: {`8`, `8`, `8`, `8`, `16`}, / cost of storing SSE register*
2073	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2074	.sse_unaligned_load: {`8`, `8`, `8`, `8`, `16`}, / cost of unaligned loads. /
2075	.sse_unaligned_store: {`8`, `8`, `8`, `8`, `16`}, / cost of unaligned stores. /
2076	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
2077	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
2078	.gather_static: `20`, .gather_per_elt: `8`, / Gather load static, per_elt. /
2079	.scatter_static: `22`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
2080	.l1_cache_size: `64`, / size of l1 cache. /
2081	.l2_cache_size: `512`, / size of l2 cache. /
2082	.prefetch_block: `64`, / size of prefetch block /
2083	.simultaneous_prefetches: `6`, / number of parallel prefetches /
2084	.branch_cost: `3`, / Branch cost /
2085	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
2086	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
2087	COSTS_N_INSNS (`20`), / cost of FDIV instruction. /
2088	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
2089	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
2090	COSTS_N_INSNS (`20`), / cost of FSQRT instruction. /
2091
2092	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
2093	COSTS_N_INSNS (`4`), / cost of ADDSS/SD SUBSS/SD insns. /
2094	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
2095	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
2096	COSTS_N_INSNS (`4`), / cost of FMA SS instruction. /
2097	COSTS_N_INSNS (`4`), / cost of FMA SD instruction. /
2098	COSTS_N_INSNS (`11`), / cost of DIVSS instruction. /
2099	COSTS_N_INSNS (`14`), / cost of DIVSD instruction. /
2100	COSTS_N_INSNS (`12`), / cost of SQRTSS instruction. /
2101	COSTS_N_INSNS (`18`), / cost of SQRTSD instruction. /
2102	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `2`, .reassoc_vec_fp: `2`, / reassoc int, fp, vec_int, vec_fp. /
2103	.memcpy: skylake_memcpy,
2104	.memset: skylake_memset,
2105	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
2106	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2107	.align_loop: "16:11:8", / Loop alignment. /
2108	.align_jump: "16:11:8", / Jump alignment. /
2109	.align_label: "0:0:8", / Label alignment. /
2110	.align_func: "16", / Func alignment. /
2111	.small_unroll_ninsns: `4`, / Small unroll limit. /
2112	.small_unroll_factor: `2`, / Small unroll factor. /
2113	};
2114
2115	/ icelake_cost should produce code tuned for Icelake family of CPUs.*
2116	NB: rep_prefix_1_byte is used only for known size. /*
2117
/* memcpy expansion strategies referenced by icelake_cost (.memcpy).
   The array has two entries with identical contents.  Each entry names
   the algorithm for blocks of unknown size (libcall) followed by
   {max size, algorithm, flag} triples; rep_prefix_1_byte appears only in
   the size-bounded list, never as the unknown-size choice.
   NOTE(review): the -1 size in the last triple presumably means "no
   upper bound", and the two entries presumably select between 32-bit
   and 64-bit targets -- confirm against the stringop_algs declaration.  */
2118	static stringop_algs icelake_memcpy[`2`] = {
2119	{.unknown_size: libcall,
2120	.size: {{`256`, rep_prefix_1_byte, true},
2121	{`256`, loop, false},
2122	{-`1`, libcall, false}}},
2123	{.unknown_size: libcall,
2124	.size: {{`256`, rep_prefix_1_byte, true},
2125	{`256`, loop, false},
2126	{-`1`, libcall, false}}}};
2127
/* memset expansion strategies referenced by icelake_cost (.memset).
   The array has two entries with identical contents.  Each entry names
   the algorithm for blocks of unknown size (libcall) followed by
   {max size, algorithm, flag} triples; rep_prefix_1_byte appears only in
   the size-bounded list, never as the unknown-size choice.
   NOTE(review): the -1 size in the last triple presumably means "no
   upper bound", and the two entries presumably select between 32-bit
   and 64-bit targets -- confirm against the stringop_algs declaration.  */
2128	static stringop_algs icelake_memset[`2`] = {
2129	{.unknown_size: libcall,
2130	.size: {{`256`, rep_prefix_1_byte, true},
2131	{`256`, loop, false},
2132	{-`1`, libcall, false}}},
2133	{.unknown_size: libcall,
2134	.size: {{`256`, rep_prefix_1_byte, true},
2135	{`256`, loop, false},
2136	{-`1`, libcall, false}}}};
2137
/* Per-operation cost table for the Icelake tuning.  The initializer is
   positional: every entry must stay in the field order of struct
   processor_costs, which is declared outside this part of the file.  The
   first brace group holds the register allocator costs, expressed
   relative to an integer reg-reg move of cost 2; the instruction costs
   that follow are written with COSTS_N_INSNS.  The numeric entries
   currently match skylake_cost one for one; only the string operation
   tables differ (icelake_memcpy and icelake_memset).
   NOTE(review): the scatter_static/scatter_per_elt line is labelled
   "Gather store"; the label looks like a copy of the gather line --
   confirm.  */
2138	static const
2139	struct processor_costs icelake_cost = {
2140	.hard_register: {
2141	/ Start of register allocator costs. integer->integer move cost is 2. /
2142	.movzbl_load: `6`, / cost for loading QImode using movzbl /
2143	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
2144	in QImode, HImode and SImode.
2145	Relative to reg-reg move (2). /*
2146	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2147	.fp_move: `2`, / cost of reg,reg fld/fst /
2148	.fp_load: {`6`, `6`, `8`}, / cost of loading fp registers*
2149	in SFmode, DFmode and XFmode /*
2150	.fp_store: {`6`, `6`, `10`}, / cost of storing fp registers*
2151	in SFmode, DFmode and XFmode /*
2152	.mmx_move: `2`, / cost of moving MMX register /
2153	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
2154	in SImode and DImode /*
2155	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
2156	in SImode and DImode /*
2157	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
2158	.sse_load: {`6`, `6`, `6`, `10`, `20`}, / cost of loading SSE registers*
2159	in 32,64,128,256 and 512-bit /*
2160	.sse_store: {`8`, `8`, `8`, `12`, `24`}, / cost of storing SSE registers*
2161	in 32,64,128,256 and 512-bit /*
2162	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
2163	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
2164	.mask_load: {`8`, `8`, `8`}, / cost of loading mask register*
2165	in QImode, HImode, SImode. /*
2166	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
2167	in QImode, HImode, SImode. /*
2168	.mask_move: `3`, / cost of moving mask register. /
2169	/ End of register allocator costs. /
2170	},
2171
2172	COSTS_N_INSNS (`1`), / cost of an add instruction /
2173	COSTS_N_INSNS (`1`)+`1`, / cost of a lea instruction /
2174	COSTS_N_INSNS (`1`), / variable shift costs /
2175	COSTS_N_INSNS (`1`), / constant shift costs /
2176	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2177	COSTS_N_INSNS (`4`), / HI /
2178	COSTS_N_INSNS (`3`), / SI /
2179	COSTS_N_INSNS (`3`), / DI /
2180	COSTS_N_INSNS (`3`)}, / other /
2181	.mult_bit: `0`, / cost of multiply per each bit set /
2182	/ Expanding div/mod currently doesn't consider parallelism. So the cost*
2183	model is not realistic. We compensate by increasing the latencies a bit. /*
2184	.divide: {COSTS_N_INSNS (`11`), / cost of a divide/mod for QI /
2185	COSTS_N_INSNS (`11`), / HI /
2186	COSTS_N_INSNS (`14`), / SI /
2187	COSTS_N_INSNS (`76`), / DI /
2188	COSTS_N_INSNS (`76`)}, / other /
2189	COSTS_N_INSNS (`1`), / cost of movsx /
2190	COSTS_N_INSNS (`0`), / cost of movzx /
2191	.large_insn: `8`, / "large" insn /
2192	.move_ratio: `17`, / MOVE_RATIO /
2193	.clear_ratio: `17`, / CLEAR_RATIO /
2194	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
2195	in QImode, HImode and SImode.
2196	Relative to reg-reg move (2). /*
2197	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers /
2198	.sse_load: {`8`, `8`, `8`, `8`, `16`}, / cost of loading SSE register*
2199	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2200	.sse_store: {`8`, `8`, `8`, `8`, `16`}, / cost of storing SSE register*
2201	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2202	.sse_unaligned_load: {`8`, `8`, `8`, `8`, `16`}, / cost of unaligned loads. /
2203	.sse_unaligned_store: {`8`, `8`, `8`, `8`, `16`}, / cost of unaligned stores. /
2204	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
2205	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
2206	.gather_static: `20`, .gather_per_elt: `8`, / Gather load static, per_elt. /
2207	.scatter_static: `22`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
2208	.l1_cache_size: `64`, / size of l1 cache. /
2209	.l2_cache_size: `512`, / size of l2 cache. /
2210	.prefetch_block: `64`, / size of prefetch block /
2211	.simultaneous_prefetches: `6`, / number of parallel prefetches /
2212	.branch_cost: `3`, / Branch cost /
2213	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
2214	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
2215	COSTS_N_INSNS (`20`), / cost of FDIV instruction. /
2216	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
2217	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
2218	COSTS_N_INSNS (`20`), / cost of FSQRT instruction. /
2219
2220	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
2221	COSTS_N_INSNS (`4`), / cost of ADDSS/SD SUBSS/SD insns. /
2222	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
2223	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
2224	COSTS_N_INSNS (`4`), / cost of FMA SS instruction. /
2225	COSTS_N_INSNS (`4`), / cost of FMA SD instruction. /
2226	COSTS_N_INSNS (`11`), / cost of DIVSS instruction. /
2227	COSTS_N_INSNS (`14`), / cost of DIVSD instruction. /
2228	COSTS_N_INSNS (`12`), / cost of SQRTSS instruction. /
2229	COSTS_N_INSNS (`18`), / cost of SQRTSD instruction. /
2230	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `2`, .reassoc_vec_fp: `2`, / reassoc int, fp, vec_int, vec_fp. /
2231	.memcpy: icelake_memcpy,
2232	.memset: icelake_memset,
2233	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
2234	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2235	.align_loop: "16:11:8", / Loop alignment. /
2236	.align_jump: "16:11:8", / Jump alignment. /
2237	.align_label: "0:0:8", / Label alignment. /
2238	.align_func: "16", / Func alignment. /
2239	.small_unroll_ninsns: `4`, / Small unroll limit. /
2240	.small_unroll_factor: `2`, / Small unroll factor. /
2241	};
2242
2243	/ alderlake_cost should produce code tuned for alderlake family of CPUs. /
/* memcpy expansion strategies referenced by alderlake_cost (.memcpy).
   The array has two entries with identical contents.  Each entry names
   the algorithm for blocks of unknown size (libcall) followed by
   {max size, algorithm, flag} triples.
   NOTE(review): the -1 size in the last triple presumably means "no
   upper bound", and the two entries presumably select between 32-bit
   and 64-bit targets -- confirm against the stringop_algs declaration.  */
2244	static stringop_algs alderlake_memcpy[`2`] = {
2245	{.unknown_size: libcall,
2246	.size: {{`256`, rep_prefix_1_byte, true},
2247	{`256`, loop, false},
2248	{-`1`, libcall, false}}},
2249	{.unknown_size: libcall,
2250	.size: {{`256`, rep_prefix_1_byte, true},
2251	{`256`, loop, false},
2252	{-`1`, libcall, false}}}};
/* memset expansion strategies referenced by alderlake_cost (.memset).
   The array has two entries with identical contents.  Each entry names
   the algorithm for blocks of unknown size (libcall) followed by
   {max size, algorithm, flag} triples.
   NOTE(review): the -1 size in the last triple presumably means "no
   upper bound", and the two entries presumably select between 32-bit
   and 64-bit targets -- confirm against the stringop_algs declaration.  */
2253	static stringop_algs alderlake_memset[`2`] = {
2254	{.unknown_size: libcall,
2255	.size: {{`256`, rep_prefix_1_byte, true},
2256	{`256`, loop, false},
2257	{-`1`, libcall, false}}},
2258	{.unknown_size: libcall,
2259	.size: {{`256`, rep_prefix_1_byte, true},
2260	{`256`, loop, false},
2261	{-`1`, libcall, false}}}};
/* Per-operation cost table for the Alder Lake tuning.  The initializer
   is positional: every entry must stay in the field order of struct
   processor_costs, which is declared outside this part of the file.  The
   first brace group holds the register allocator costs, expressed
   relative to an integer reg-reg move of cost 2; the instruction costs
   that follow are written with COSTS_N_INSNS.  The lea cost is
   COSTS_N_INSNS (1) + 1, i.e. slightly more than an add.  String
   operations use alderlake_memcpy and alderlake_memset.
   NOTE(review): the scatter_static/scatter_per_elt line is labelled
   "Gather store", and the sse_unaligned_store remark is misspelled
   ("storess"); both trailing remarks need a touch-up.  */
2262	static const
2263	struct processor_costs alderlake_cost = {
2264	.hard_register: {
2265	/ Start of register allocator costs. integer->integer move cost is 2. /
2266	.movzbl_load: `6`, / cost for loading QImode using movzbl /
2267	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
2268	in QImode, HImode and SImode.
2269	Relative to reg-reg move (2). /*
2270	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2271	.fp_move: `4`, / cost of reg,reg fld/fst /
2272	.fp_load: {`6`, `6`, `12`}, / cost of loading fp registers*
2273	in SFmode, DFmode and XFmode /*
2274	.fp_store: {`6`, `6`, `12`}, / cost of storing fp registers*
2275	in SFmode, DFmode and XFmode /*
2276	.mmx_move: `2`, / cost of moving MMX register /
2277	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
2278	in SImode and DImode /*
2279	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
2280	in SImode and DImode /*
2281	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
2282	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE registers*
2283	in 32,64,128,256 and 512-bit /*
2284	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE registers*
2285	in 32,64,128,256 and 512-bit /*
2286	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
2287	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
2288	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
2289	in QImode, HImode, SImode. /*
2290	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
2291	in QImode, HImode, SImode. /*
2292	.mask_move: `2`, / cost of moving mask register. /
2293	/ End of register allocator costs. /
2294	},
2295
2296	COSTS_N_INSNS (`1`), / cost of an add instruction /
2297	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
2298	COSTS_N_INSNS (`1`), / variable shift costs /
2299	COSTS_N_INSNS (`1`), / constant shift costs /
2300	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2301	COSTS_N_INSNS (`4`), / HI /
2302	COSTS_N_INSNS (`3`), / SI /
2303	COSTS_N_INSNS (`4`), / DI /
2304	COSTS_N_INSNS (`4`)}, / other /
2305	.mult_bit: `0`, / cost of multiply per each bit set /
2306	.divide: {COSTS_N_INSNS (`16`), / cost of a divide/mod for QI /
2307	COSTS_N_INSNS (`22`), / HI /
2308	COSTS_N_INSNS (`30`), / SI /
2309	COSTS_N_INSNS (`74`), / DI /
2310	COSTS_N_INSNS (`74`)}, / other /
2311	COSTS_N_INSNS (`1`), / cost of movsx /
2312	COSTS_N_INSNS (`1`), / cost of movzx /
2313	.large_insn: `8`, / "large" insn /
2314	.move_ratio: `17`, / MOVE_RATIO /
2315	.clear_ratio: `17`, / CLEAR_RATIO /
2316	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
2317	in QImode, HImode and SImode.
2318	Relative to reg-reg move (2). /*
2319	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers /
2320	.sse_load: {`8`, `8`, `8`, `10`, `15`}, / cost of loading SSE register*
2321	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2322	.sse_store: {`8`, `8`, `8`, `10`, `15`}, / cost of storing SSE register*
2323	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2324	.sse_unaligned_load: {`8`, `8`, `8`, `10`, `15`}, / cost of unaligned loads. /
2325	.sse_unaligned_store: {`8`, `8`, `8`, `10`, `15`}, / cost of unaligned storess. /
2326	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
2327	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
2328	.gather_static: `18`, .gather_per_elt: `6`, / Gather load static, per_elt. /
2329	.scatter_static: `18`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
2330	.l1_cache_size: `32`, / size of l1 cache. /
2331	.l2_cache_size: `512`, / size of l2 cache. /
2332	.prefetch_block: `64`, / size of prefetch block /
2333	.simultaneous_prefetches: `6`, / number of parallel prefetches /
2334	.branch_cost: `3`, / Branch cost /
2335	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
2336	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
2337	COSTS_N_INSNS (`17`), / cost of FDIV instruction. /
2338	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
2339	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
2340	COSTS_N_INSNS (`14`), / cost of FSQRT instruction. /
2341
2342	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
2343	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
2344	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
2345	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
2346	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
2347	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
2348	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
2349	COSTS_N_INSNS (`17`), / cost of DIVSD instruction. /
2350	COSTS_N_INSNS (`14`), / cost of SQRTSS instruction. /
2351	COSTS_N_INSNS (`18`), / cost of SQRTSD instruction. /
2352	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `3`, / reassoc int, fp, vec_int, vec_fp. /
2353	.memcpy: alderlake_memcpy,
2354	.memset: alderlake_memset,
2355	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
2356	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
2357	.align_loop: "16:11:8", / Loop alignment. /
2358	.align_jump: "16:11:8", / Jump alignment. /
2359	.align_label: "0:0:8", / Label alignment. /
2360	.align_func: "16", / Func alignment. /
2361	.small_unroll_ninsns: `4`, / Small unroll limit. /
2362	.small_unroll_factor: `2`, / Small unroll factor. /
2363	};
2364
2365	/* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
2366	   very small blocks it is better to use a loop.  For large blocks, a libcall
2367	   can use non-temporal accesses and beat inline code considerably.  */
/* memcpy expansion strategies referenced by btver1_cost (.memcpy).
   Each of the two entries names the algorithm for blocks of unknown size
   (libcall) followed by {max size, algorithm, flag} triples.  The first
   entry uses loop up to 6, unrolled_loop up to 14 and rep_prefix_4_byte
   beyond; the second uses loop up to 16, rep_prefix_8_byte up to 8192
   and libcall beyond.
   NOTE(review): the -1 size presumably means "no upper bound", and the
   two entries presumably select between 32-bit and 64-bit targets --
   confirm against the stringop_algs declaration.  */
2368	static stringop_algs btver1_memcpy[`2`] = {
2369	{.unknown_size: libcall, .size: {{`6`, loop, false}, {`14`, unrolled_loop, false},
2370	{-`1`, rep_prefix_4_byte, false}}},
2371	{.unknown_size: libcall, .size: {{`16`, loop, false}, {`8192`, rep_prefix_8_byte, false},
2372	{-`1`, libcall, false}}}};
/* memset expansion strategies referenced by btver1_cost (.memset).
   Each of the two entries names the algorithm for blocks of unknown size
   (libcall) followed by {max size, algorithm, flag} triples.  The first
   entry uses loop up to 8, unrolled_loop up to 24, rep_prefix_4_byte up
   to 2048 and libcall beyond; the second uses unrolled_loop up to 48,
   rep_prefix_8_byte up to 8192 and libcall beyond.
   NOTE(review): the -1 size presumably means "no upper bound", and the
   two entries presumably select between 32-bit and 64-bit targets --
   confirm against the stringop_algs declaration.  */
2373	static stringop_algs btver1_memset[`2`] = {
2374	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`24`, unrolled_loop, false},
2375	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
2376	{.unknown_size: libcall, .size: {{`48`, unrolled_loop, false}, {`8192`, rep_prefix_8_byte, false},
2377	{-`1`, libcall, false}}}};
/* Per-operation cost table named for the btver1 tuning.  The initializer
   is positional: every entry must stay in the field order of struct
   processor_costs, which is declared outside this part of the file.  The
   first brace group holds the register allocator costs, expressed
   relative to an integer reg-reg move of cost 2; the instruction costs
   that follow are written with COSTS_N_INSNS.  String operations use
   btver1_memcpy and btver1_memset.
   NOTE(review): the scatter_static/scatter_per_elt line is labelled
   "Gather store"; the label looks like a copy of the gather line --
   confirm.  */
2378	const struct processor_costs btver1_cost = {
2379	.hard_register: {
2380	/ Start of register allocator costs. integer->integer move cost is 2. /
2381	.movzbl_load: `8`, / cost for loading QImode using movzbl /
2382	.int_load: {`6`, `8`, `6`}, / cost of loading integer registers*
2383	in QImode, HImode and SImode.
2384	Relative to reg-reg move (2). /*
2385	.int_store: {`6`, `8`, `6`}, / cost of storing integer registers /
2386	.fp_move: `4`, / cost of reg,reg fld/fst /
2387	.fp_load: {`12`, `12`, `28`}, / cost of loading fp registers*
2388	in SFmode, DFmode and XFmode /*
2389	.fp_store: {`12`, `12`, `38`}, / cost of storing fp registers*
2390	in SFmode, DFmode and XFmode /*
2391	.mmx_move: `4`, / cost of moving MMX register /
2392	.mmx_load: {`10`, `10`}, / cost of loading MMX registers*
2393	in SImode and DImode /*
2394	.mmx_store: {`12`, `12`}, / cost of storing MMX registers*
2395	in SImode and DImode /*
2396	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2397	.sse_load: {`10`, `10`, `12`, `48`, `96`}, / cost of loading SSE registers*
2398	in 32,64,128,256 and 512-bit /*
2399	.sse_store: {`10`, `10`, `12`, `48`, `96`}, / cost of storing SSE registers*
2400	in 32,64,128,256 and 512-bit /*
2401	.sse_to_integer: `14`, .integer_to_sse: `14`, / SSE->integer and integer->SSE moves /
2402	.mask_to_integer: `14`, .integer_to_mask: `14`, / mask->integer and integer->mask moves /
2403	.mask_load: {`6`, `8`, `6`}, / cost of loading mask register*
2404	in QImode, HImode, SImode. /*
2405	.mask_store: {`6`, `8`, `6`}, / cost if storing mask register*
2406	in QImode, HImode, SImode. /*
2407	.mask_move: `2`, / cost of moving mask register. /
2408	/ End of register allocator costs. /
2409	},
2410
2411	COSTS_N_INSNS (`1`), / cost of an add instruction /
2412	COSTS_N_INSNS (`2`), / cost of a lea instruction /
2413	COSTS_N_INSNS (`1`), / variable shift costs /
2414	COSTS_N_INSNS (`1`), / constant shift costs /
2415	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2416	COSTS_N_INSNS (`4`), / HI /
2417	COSTS_N_INSNS (`3`), / SI /
2418	COSTS_N_INSNS (`4`), / DI /
2419	COSTS_N_INSNS (`5`)}, / other /
2420	.mult_bit: `0`, / cost of multiply per each bit set /
2421	.divide: {COSTS_N_INSNS (`19`), / cost of a divide/mod for QI /
2422	COSTS_N_INSNS (`35`), / HI /
2423	COSTS_N_INSNS (`51`), / SI /
2424	COSTS_N_INSNS (`83`), / DI /
2425	COSTS_N_INSNS (`83`)}, / other /
2426	COSTS_N_INSNS (`1`), / cost of movsx /
2427	COSTS_N_INSNS (`1`), / cost of movzx /
2428	.large_insn: `8`, / "large" insn /
2429	.move_ratio: `9`, / MOVE_RATIO /
2430	.clear_ratio: `6`, / CLEAR_RATIO /
2431	.int_load: {`6`, `8`, `6`}, / cost of loading integer registers*
2432	in QImode, HImode and SImode.
2433	Relative to reg-reg move (2). /*
2434	.int_store: {`6`, `8`, `6`}, / cost of storing integer registers /
2435	.sse_load: {`10`, `10`, `12`, `48`, `96`}, / cost of loading SSE register*
2436	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2437	.sse_store: {`10`, `10`, `12`, `48`, `96`}, / cost of storing SSE register*
2438	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2439	.sse_unaligned_load: {`10`, `10`, `12`, `48`, `96`}, / cost of unaligned loads. /
2440	.sse_unaligned_store: {`10`, `10`, `12`, `48`, `96`}, / cost of unaligned stores. /
2441	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2442	.sse_to_integer: `14`, / cost of moving SSE register to integer. /
2443	.gather_static: `10`, .gather_per_elt: `10`, / Gather load static, per_elt. /
2444	.scatter_static: `10`, .scatter_per_elt: `10`, / Gather store static, per_elt. /
2445	.l1_cache_size: `32`, / size of l1 cache. /
2446	.l2_cache_size: `512`, / size of l2 cache. /
2447	.prefetch_block: `64`, / size of prefetch block /
2448	.simultaneous_prefetches: `100`, / number of parallel prefetches /
2449	.branch_cost: `2`, / Branch cost /
2450	COSTS_N_INSNS (`4`), / cost of FADD and FSUB insns. /
2451	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
2452	COSTS_N_INSNS (`19`), / cost of FDIV instruction. /
2453	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
2454	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
2455	COSTS_N_INSNS (`35`), / cost of FSQRT instruction. /
2456
2457	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
2458	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
2459	COSTS_N_INSNS (`2`), / cost of MULSS instruction. /
2460	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
2461	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
2462	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
2463	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
2464	COSTS_N_INSNS (`17`), / cost of DIVSD instruction. /
2465	COSTS_N_INSNS (`14`), / cost of SQRTSS instruction. /
2466	COSTS_N_INSNS (`48`), / cost of SQRTSD instruction. /
2467	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
2468	.memcpy: btver1_memcpy,
2469	.memset: btver1_memset,
2470	COSTS_N_INSNS (`2`), / cond_taken_branch_cost. /
2471	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2472	.align_loop: "16:11:8", / Loop alignment. /
2473	.align_jump: "16:8:8", / Jump alignment. /
2474	.align_label: "0:0:8", / Label alignment. /
2475	.align_func: "11", / Func alignment. /
2476	.small_unroll_ninsns: `4`, / Small unroll limit. /
2477	.small_unroll_factor: `2`, / Small unroll factor. /
2478	};
2479
/* memcpy expansion strategies; btver2_cost selects this table through its
   .memcpy entry.  Each of the two elements gives the algorithm for blocks
   of unknown size plus a list of size ranges closed by a -1 entry.
   NOTE(review): the triples look like {upper size bound, algorithm,
   noalign flag}, and element [1] (the only one using rep_prefix_8_byte)
   is presumably the 64-bit variant -- confirm against the stringop_algs
   declaration.  */
2480	static stringop_algs btver2_memcpy[`2`] = {
2481	{.unknown_size: libcall, .size: {{`6`, loop, false}, {`14`, unrolled_loop, false},
2482	{-`1`, rep_prefix_4_byte, false}}},
2483	{.unknown_size: libcall, .size: {{`16`, loop, false}, {`8192`, rep_prefix_8_byte, false},
2484	{-`1`, libcall, false}}}};
/* memset expansion strategies; btver2_cost selects this table through its
   .memset entry.  Two elements, each with an unknown-size algorithm and a
   list of size ranges ending in a -1 entry that falls back to libcall.
   NOTE(review): element [1] is presumably the 64-bit variant (it is the
   one using rep_prefix_8_byte) -- confirm against the users of this
   table.  */
2485	static stringop_algs btver2_memset[`2`] = {
2486	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`24`, unrolled_loop, false},
2487	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
2488	{.unknown_size: libcall, .size: {{`48`, unrolled_loop, false}, {`8192`, rep_prefix_8_byte, false},
2489	{-`1`, libcall, false}}}};
/* Cost table for btver2 tuning.  The .hard_register group holds the
   register-allocator move/load/store costs; the entries after it give
   integer instruction costs, memory and cache parameters, x87 and SSE
   instruction costs, the string-operation strategy tables
   (btver2_memcpy, btver2_memset), branch costs and alignment strings.  */
2490	const struct processor_costs btver2_cost = {
2491	.hard_register: {
2492	/ Start of register allocator costs. integer->integer move cost is 2. /
2493	.movzbl_load: `8`, / cost for loading QImode using movzbl /
2494	.int_load: {`8`, `8`, `6`}, / cost of loading integer registers*
2495	in QImode, HImode and SImode.
2496	Relative to reg-reg move (2). /*
2497	.int_store: {`8`, `8`, `6`}, / cost of storing integer registers /
2498	.fp_move: `4`, / cost of reg,reg fld/fst /
2499	.fp_load: {`12`, `12`, `28`}, / cost of loading fp registers*
2500	in SFmode, DFmode and XFmode /*
2501	.fp_store: {`12`, `12`, `38`}, / cost of storing fp registers*
2502	in SFmode, DFmode and XFmode /*
2503	.mmx_move: `4`, / cost of moving MMX register /
2504	.mmx_load: {`10`, `10`}, / cost of loading MMX registers*
2505	in SImode and DImode /*
2506	.mmx_store: {`12`, `12`}, / cost of storing MMX registers*
2507	in SImode and DImode /*
2508	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2509	.sse_load: {`10`, `10`, `12`, `48`, `96`}, / cost of loading SSE registers*
2510	in 32,64,128,256 and 512-bit /*
2511	.sse_store: {`10`, `10`, `12`, `48`, `96`}, / cost of storing SSE registers*
2512	in 32,64,128,256 and 512-bit /*
2513	.sse_to_integer: `14`, .integer_to_sse: `14`, / SSE->integer and integer->SSE moves /
2514	.mask_to_integer: `14`, .integer_to_mask: `14`, / mask->integer and integer->mask moves /
2515	.mask_load: {`8`, `8`, `6`}, / cost of loading mask register*
2516	in QImode, HImode, SImode. /*
2517	.mask_store: {`8`, `8`, `6`}, / cost of storing mask register*
2518	in QImode, HImode, SImode. /*
2519	.mask_move: `2`, / cost of moving mask register. /
2520	/ End of register allocator costs. /
2521	},
2522
2523	COSTS_N_INSNS (`1`), / cost of an add instruction /
2524	COSTS_N_INSNS (`2`), / cost of a lea instruction /
2525	COSTS_N_INSNS (`1`), / variable shift costs /
2526	COSTS_N_INSNS (`1`), / constant shift costs /
2527	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2528	COSTS_N_INSNS (`4`), / HI /
2529	COSTS_N_INSNS (`3`), / SI /
2530	COSTS_N_INSNS (`4`), / DI /
2531	COSTS_N_INSNS (`5`)}, / other /
2532	.mult_bit: `0`, / cost of multiply per each bit set /
2533	.divide: {COSTS_N_INSNS (`19`), / cost of a divide/mod for QI /
2534	COSTS_N_INSNS (`35`), / HI /
2535	COSTS_N_INSNS (`51`), / SI /
2536	COSTS_N_INSNS (`83`), / DI /
2537	COSTS_N_INSNS (`83`)}, / other /
2538	COSTS_N_INSNS (`1`), / cost of movsx /
2539	COSTS_N_INSNS (`1`), / cost of movzx /
2540	.large_insn: `8`, / "large" insn /
2541	.move_ratio: `9`, / MOVE_RATIO /
2542	.clear_ratio: `6`, / CLEAR_RATIO /
2543	.int_load: {`8`, `8`, `6`}, / cost of loading integer registers*
2544	in QImode, HImode and SImode.
2545	Relative to reg-reg move (2). /*
2546	.int_store: {`8`, `8`, `6`}, / cost of storing integer registers /
2547	.sse_load: {`10`, `10`, `12`, `48`, `96`}, / cost of loading SSE register*
2548	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2549	.sse_store: {`10`, `10`, `12`, `48`, `96`}, / cost of storing SSE register*
2550	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2551	.sse_unaligned_load: {`10`, `10`, `12`, `48`, `96`}, / cost of unaligned loads. /
2552	.sse_unaligned_store: {`10`, `10`, `12`, `48`, `96`}, / cost of unaligned stores. /
2553	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2554	.sse_to_integer: `14`, / cost of moving SSE register to integer. /
2555	.gather_static: `10`, .gather_per_elt: `10`, / Gather load static, per_elt. /
2556	.scatter_static: `10`, .scatter_per_elt: `10`, / Scatter store static, per_elt. /
2557	.l1_cache_size: `32`, / size of l1 cache. /
2558	.l2_cache_size: `2048`, / size of l2 cache. /
2559	.prefetch_block: `64`, / size of prefetch block /
2560	.simultaneous_prefetches: `100`, / number of parallel prefetches /
2561	.branch_cost: `2`, / Branch cost /
2562	COSTS_N_INSNS (`4`), / cost of FADD and FSUB insns. /
2563	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
2564	COSTS_N_INSNS (`19`), / cost of FDIV instruction. /
2565	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
2566	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
2567	COSTS_N_INSNS (`35`), / cost of FSQRT instruction. /
2568
2569	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
2570	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
2571	COSTS_N_INSNS (`2`), / cost of MULSS instruction. /
2572	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
2573	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
2574	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
2575	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
2576	COSTS_N_INSNS (`19`), / cost of DIVSD instruction. /
2577	COSTS_N_INSNS (`16`), / cost of SQRTSS instruction. /
2578	COSTS_N_INSNS (`21`), / cost of SQRTSD instruction. /
2579	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
2580	.memcpy: btver2_memcpy,
2581	.memset: btver2_memset,
2582	COSTS_N_INSNS (`2`), / cond_taken_branch_cost. /
2583	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2584	.align_loop: "16:11:8", / Loop alignment. /
2585	.align_jump: "16:8:8", / Jump alignment. /
2586	.align_label: "0:0:8", / Label alignment. /
2587	.align_func: "11", / Func alignment. /
2588	.small_unroll_ninsns: `4`, / Small unroll limit. /
2589	.small_unroll_factor: `2`, / Small unroll factor. /
2590	};
2591
/* memcpy expansion strategies; pentium4_cost selects this table through
   its .memcpy entry.  Only element [0] carries real choices; element [1]
   is DUMMY_STRINGOP_ALGS, i.e. libcall for every size.
   NOTE(review): [1] is presumably the 64-bit slot, unused for this
   CPU -- confirm against the users of this table.  */
2592	static stringop_algs pentium4_memcpy[`2`] = {
2593	{.unknown_size: libcall, .size: {{`12`, loop_1_byte, false}, {-`1`, rep_prefix_4_byte, false}}},
2594	DUMMY_STRINGOP_ALGS};
/* memset expansion strategies; pentium4_cost selects this table through
   its .memset entry.  Element [0] lists the per-size-range algorithms and
   ends with a libcall fallback; element [1] is DUMMY_STRINGOP_ALGS, i.e.
   libcall for every size.  */
2595	static stringop_algs pentium4_memset[`2`] = {
2596	{.unknown_size: libcall, .size: {{`6`, loop_1_byte, false}, {`48`, loop, false},
2597	{`20480`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
2598	DUMMY_STRINGOP_ALGS};
2599
/* Cost table for pentium4 tuning.  The .hard_register group holds the
   register-allocator move/load/store costs; the entries after it give
   integer instruction costs, memory and cache parameters, x87 and SSE
   instruction costs, the string-operation strategy tables
   (pentium4_memcpy, pentium4_memset), branch costs and alignments.
   All four alignment entries are NULL for this table.  */
2600	static const
2601	struct processor_costs pentium4_cost = {
2602	.hard_register: {
2603	/ Start of register allocator costs. integer->integer move cost is 2. /
2604	.movzbl_load: `5`, / cost for loading QImode using movzbl /
2605	.int_load: {`4`, `5`, `4`}, / cost of loading integer registers*
2606	in QImode, HImode and SImode.
2607	Relative to reg-reg move (2). /*
2608	.int_store: {`2`, `3`, `2`}, / cost of storing integer registers /
2609	.fp_move: `12`, / cost of reg,reg fld/fst /
2610	.fp_load: {`14`, `14`, `14`}, / cost of loading fp registers*
2611	in SFmode, DFmode and XFmode /*
2612	.fp_store: {`14`, `14`, `14`}, / cost of storing fp registers*
2613	in SFmode, DFmode and XFmode /*
2614	.mmx_move: `12`, / cost of moving MMX register /
2615	.mmx_load: {`16`, `16`}, / cost of loading MMX registers*
2616	in SImode and DImode /*
2617	.mmx_store: {`16`, `16`}, / cost of storing MMX registers*
2618	in SImode and DImode /*
2619	.xmm_move: `12`, .ymm_move: `24`, .zmm_move: `48`, / cost of moving XMM,YMM,ZMM register /
2620	.sse_load: {`16`, `16`, `16`, `32`, `64`}, / cost of loading SSE registers*
2621	in 32,64,128,256 and 512-bit /*
2622	.sse_store: {`16`, `16`, `16`, `32`, `64`}, / cost of storing SSE registers*
2623	in 32,64,128,256 and 512-bit /*
2624	.sse_to_integer: `20`, .integer_to_sse: `12`, / SSE->integer and integer->SSE moves /
2625	.mask_to_integer: `20`, .integer_to_mask: `12`, / mask->integer and integer->mask moves /
2626	.mask_load: {`4`, `5`, `4`}, / cost of loading mask register*
2627	in QImode, HImode, SImode. /*
2628	.mask_store: {`2`, `3`, `2`}, / cost of storing mask register*
2629	in QImode, HImode, SImode. /*
2630	.mask_move: `2`, / cost of moving mask register. /
2631	/ End of register allocator costs. /
2632	},
2633
2634	COSTS_N_INSNS (`1`), / cost of an add instruction /
2635	COSTS_N_INSNS (`3`), / cost of a lea instruction /
2636	COSTS_N_INSNS (`4`), / variable shift costs /
2637	COSTS_N_INSNS (`4`), / constant shift costs /
2638	.mult_init: {COSTS_N_INSNS (`15`), / cost of starting multiply for QI /
2639	COSTS_N_INSNS (`15`), / HI /
2640	COSTS_N_INSNS (`15`), / SI /
2641	COSTS_N_INSNS (`15`), / DI /
2642	COSTS_N_INSNS (`15`)}, / other /
2643	.mult_bit: `0`, / cost of multiply per each bit set /
2644	.divide: {COSTS_N_INSNS (`56`), / cost of a divide/mod for QI /
2645	COSTS_N_INSNS (`56`), / HI /
2646	COSTS_N_INSNS (`56`), / SI /
2647	COSTS_N_INSNS (`56`), / DI /
2648	COSTS_N_INSNS (`56`)}, / other /
2649	COSTS_N_INSNS (`1`), / cost of movsx /
2650	COSTS_N_INSNS (`1`), / cost of movzx /
2651	.large_insn: `16`, / "large" insn /
2652	.move_ratio: `6`, / MOVE_RATIO /
2653	.clear_ratio: `6`, / CLEAR_RATIO /
2654	.int_load: {`4`, `5`, `4`}, / cost of loading integer registers*
2655	in QImode, HImode and SImode.
2656	Relative to reg-reg move (2). /*
2657	.int_store: {`2`, `3`, `2`}, / cost of storing integer registers /
2658	.sse_load: {`16`, `16`, `16`, `32`, `64`}, / cost of loading SSE register*
2659	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2660	.sse_store: {`16`, `16`, `16`, `32`, `64`}, / cost of storing SSE register*
2661	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2662	.sse_unaligned_load: {`32`, `32`, `32`, `64`, `128`}, / cost of unaligned loads. /
2663	.sse_unaligned_store: {`32`, `32`, `32`, `64`, `128`}, / cost of unaligned stores. /
2664	.xmm_move: `12`, .ymm_move: `24`, .zmm_move: `48`, / cost of moving XMM,YMM,ZMM register /
2665	.sse_to_integer: `20`, / cost of moving SSE register to integer. /
2666	.gather_static: `16`, .gather_per_elt: `16`, / Gather load static, per_elt. /
2667	.scatter_static: `16`, .scatter_per_elt: `16`, / Scatter store static, per_elt. /
2668	.l1_cache_size: `8`, / size of l1 cache. /
2669	.l2_cache_size: `256`, / size of l2 cache. /
2670	.prefetch_block: `64`, / size of prefetch block /
2671	.simultaneous_prefetches: `6`, / number of parallel prefetches /
2672	.branch_cost: `2`, / Branch cost /
2673	COSTS_N_INSNS (`5`), / cost of FADD and FSUB insns. /
2674	COSTS_N_INSNS (`7`), / cost of FMUL instruction. /
2675	COSTS_N_INSNS (`43`), / cost of FDIV instruction. /
2676	COSTS_N_INSNS (`2`), / cost of FABS instruction. /
2677	COSTS_N_INSNS (`2`), / cost of FCHS instruction. /
2678	COSTS_N_INSNS (`43`), / cost of FSQRT instruction. /
2679
2680	COSTS_N_INSNS (`2`), / cost of cheap SSE instruction. /
2681	COSTS_N_INSNS (`4`), / cost of ADDSS/SD SUBSS/SD insns. /
2682	COSTS_N_INSNS (`6`), / cost of MULSS instruction. /
2683	COSTS_N_INSNS (`6`), / cost of MULSD instruction. /
2684	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
2685	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
2686	COSTS_N_INSNS (`23`), / cost of DIVSS instruction. /
2687	COSTS_N_INSNS (`38`), / cost of DIVSD instruction. /
2688	COSTS_N_INSNS (`23`), / cost of SQRTSS instruction. /
2689	COSTS_N_INSNS (`38`), / cost of SQRTSD instruction. /
2690	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
2691	.memcpy: pentium4_memcpy,
2692	.memset: pentium4_memset,
2693	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
2694	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2695	NULL, / Loop alignment. /
2696	NULL, / Jump alignment. /
2697	NULL, / Label alignment. /
2698	NULL, / Func alignment. /
2699	.small_unroll_ninsns: `4`, / Small unroll limit. /
2700	.small_unroll_factor: `2`, / Small unroll factor. /
2701	};
2702
/* memcpy expansion strategies; nocona_cost selects this table through its
   .memcpy entry.  Element [0] matches the first element of
   pentium4_memcpy; element [1] adds rep_prefix_8_byte and unrolled_loop
   ranges before falling back to libcall.
   NOTE(review): [1] is presumably the 64-bit variant -- confirm against
   the users of this table.  */
2703	static stringop_algs nocona_memcpy[`2`] = {
2704	{.unknown_size: libcall, .size: {{`12`, loop_1_byte, false}, {-`1`, rep_prefix_4_byte, false}}},
2705	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`20000`, rep_prefix_8_byte, false},
2706	{`100000`, unrolled_loop, false}, {-`1`, libcall, false}}}};
2707
/* memset expansion strategies; nocona_cost selects this table through its
   .memset entry.  Two elements, each with an unknown-size algorithm and a
   list of size ranges ending in a libcall fallback.
   NOTE(review): element [1] (the one using rep_prefix_8_byte) is
   presumably the 64-bit variant -- confirm against the users of this
   table.  */
2708	static stringop_algs nocona_memset[`2`] = {
2709	{.unknown_size: libcall, .size: {{`6`, loop_1_byte, false}, {`48`, loop, false},
2710	{`20480`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
2711	{.unknown_size: libcall, .size: {{`24`, loop, false}, {`64`, unrolled_loop, false},
2712	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
2713
/* Cost table for nocona tuning.  The .hard_register group holds the
   register-allocator move/load/store costs; the entries after it give
   integer instruction costs, memory and cache parameters, x87 and SSE
   instruction costs, the string-operation strategy tables
   (nocona_memcpy, nocona_memset), branch costs and alignments.
   All four alignment entries are NULL for this table.  */
2714	static const
2715	struct processor_costs nocona_cost = {
2716	.hard_register: {
2717	/ Start of register allocator costs. integer->integer move cost is 2. /
2718	.movzbl_load: `4`, / cost for loading QImode using movzbl /
2719	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
2720	in QImode, HImode and SImode.
2721	Relative to reg-reg move (2). /*
2722	.int_store: {`4`, `4`, `4`}, / cost of storing integer registers /
2723	.fp_move: `12`, / cost of reg,reg fld/fst /
2724	.fp_load: {`14`, `14`, `14`}, / cost of loading fp registers*
2725	in SFmode, DFmode and XFmode /*
2726	.fp_store: {`14`, `14`, `14`}, / cost of storing fp registers*
2727	in SFmode, DFmode and XFmode /*
2728	.mmx_move: `14`, / cost of moving MMX register /
2729	.mmx_load: {`12`, `12`}, / cost of loading MMX registers*
2730	in SImode and DImode /*
2731	.mmx_store: {`12`, `12`}, / cost of storing MMX registers*
2732	in SImode and DImode /*
2733	.xmm_move: `6`, .ymm_move: `12`, .zmm_move: `24`, / cost of moving XMM,YMM,ZMM register /
2734	.sse_load: {`12`, `12`, `12`, `24`, `48`}, / cost of loading SSE registers*
2735	in 32,64,128,256 and 512-bit /*
2736	.sse_store: {`12`, `12`, `12`, `24`, `48`}, / cost of storing SSE registers*
2737	in 32,64,128,256 and 512-bit /*
2738	.sse_to_integer: `20`, .integer_to_sse: `12`, / SSE->integer and integer->SSE moves /
2739	.mask_to_integer: `20`, .integer_to_mask: `12`, / mask->integer and integer->mask moves /
2740	.mask_load: {`4`, `4`, `4`}, / cost of loading mask register*
2741	in QImode, HImode, SImode. /*
2742	.mask_store: {`4`, `4`, `4`}, / cost of storing mask register*
2743	in QImode, HImode, SImode. /*
2744	.mask_move: `2`, / cost of moving mask register. /
2745	/ End of register allocator costs. /
2746	},
2747
2748	COSTS_N_INSNS (`1`), / cost of an add instruction /
2749	COSTS_N_INSNS (`1`), / cost of a lea instruction /
2750	COSTS_N_INSNS (`1`), / variable shift costs /
2751	COSTS_N_INSNS (`1`), / constant shift costs /
2752	.mult_init: {COSTS_N_INSNS (`10`), / cost of starting multiply for QI /
2753	COSTS_N_INSNS (`10`), / HI /
2754	COSTS_N_INSNS (`10`), / SI /
2755	COSTS_N_INSNS (`10`), / DI /
2756	COSTS_N_INSNS (`10`)}, / other /
2757	.mult_bit: `0`, / cost of multiply per each bit set /
2758	.divide: {COSTS_N_INSNS (`66`), / cost of a divide/mod for QI /
2759	COSTS_N_INSNS (`66`), / HI /
2760	COSTS_N_INSNS (`66`), / SI /
2761	COSTS_N_INSNS (`66`), / DI /
2762	COSTS_N_INSNS (`66`)}, / other /
2763	COSTS_N_INSNS (`1`), / cost of movsx /
2764	COSTS_N_INSNS (`1`), / cost of movzx /
2765	.large_insn: `16`, / "large" insn /
2766	.move_ratio: `17`, / MOVE_RATIO /
2767	.clear_ratio: `6`, / CLEAR_RATIO /
2768	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
2769	in QImode, HImode and SImode.
2770	Relative to reg-reg move (2). /*
2771	.int_store: {`4`, `4`, `4`}, / cost of storing integer registers /
2772	.sse_load: {`12`, `12`, `12`, `24`, `48`}, / cost of loading SSE register*
2773	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2774	.sse_store: {`12`, `12`, `12`, `24`, `48`}, / cost of storing SSE register*
2775	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2776	.sse_unaligned_load: {`24`, `24`, `24`, `48`, `96`}, / cost of unaligned loads. /
2777	.sse_unaligned_store: {`24`, `24`, `24`, `48`, `96`}, / cost of unaligned stores. /
2778	.xmm_move: `6`, .ymm_move: `12`, .zmm_move: `24`, / cost of moving XMM,YMM,ZMM register /
2779	.sse_to_integer: `20`, / cost of moving SSE register to integer. /
2780	.gather_static: `12`, .gather_per_elt: `12`, / Gather load static, per_elt. /
2781	.scatter_static: `12`, .scatter_per_elt: `12`, / Scatter store static, per_elt. /
2782	.l1_cache_size: `8`, / size of l1 cache. /
2783	.l2_cache_size: `1024`, / size of l2 cache. /
2784	.prefetch_block: `64`, / size of prefetch block /
2785	.simultaneous_prefetches: `8`, / number of parallel prefetches /
2786	.branch_cost: `1`, / Branch cost /
2787	COSTS_N_INSNS (`6`), / cost of FADD and FSUB insns. /
2788	COSTS_N_INSNS (`8`), / cost of FMUL instruction. /
2789	COSTS_N_INSNS (`40`), / cost of FDIV instruction. /
2790	COSTS_N_INSNS (`3`), / cost of FABS instruction. /
2791	COSTS_N_INSNS (`3`), / cost of FCHS instruction. /
2792	COSTS_N_INSNS (`44`), / cost of FSQRT instruction. /
2793
2794	COSTS_N_INSNS (`2`), / cost of cheap SSE instruction. /
2795	COSTS_N_INSNS (`5`), / cost of ADDSS/SD SUBSS/SD insns. /
2796	COSTS_N_INSNS (`7`), / cost of MULSS instruction. /
2797	COSTS_N_INSNS (`7`), / cost of MULSD instruction. /
2798	COSTS_N_INSNS (`7`), / cost of FMA SS instruction. /
2799	COSTS_N_INSNS (`7`), / cost of FMA SD instruction. /
2800	COSTS_N_INSNS (`32`), / cost of DIVSS instruction. /
2801	COSTS_N_INSNS (`40`), / cost of DIVSD instruction. /
2802	COSTS_N_INSNS (`32`), / cost of SQRTSS instruction. /
2803	COSTS_N_INSNS (`41`), / cost of SQRTSD instruction. /
2804	.reassoc_int: `1`, .reassoc_fp: `1`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
2805	.memcpy: nocona_memcpy,
2806	.memset: nocona_memset,
2807	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
2808	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2809	NULL, / Loop alignment. /
2810	NULL, / Jump alignment. /
2811	NULL, / Label alignment. /
2812	NULL, / Func alignment. /
2813	.small_unroll_ninsns: `4`, / Small unroll limit. /
2814	.small_unroll_factor: `2`, / Small unroll factor. /
2815	};
2816
/* memcpy expansion strategies; atom_cost selects this table through its
   .memcpy entry.  Two elements, each with an unknown-size algorithm and a
   list of size ranges closed by a -1 entry.
   NOTE(review): element [1] (the one using rep_prefix_8_byte) is
   presumably the 64-bit variant -- confirm against the users of this
   table.  */
2817	static stringop_algs atom_memcpy[`2`] = {
2818	{.unknown_size: libcall, .size: {{`11`, loop, false}, {-`1`, rep_prefix_4_byte, false}}},
2819	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`64`, rep_prefix_4_byte, false},
2820	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
/* memset expansion strategies; atom_cost selects this table through its
   .memset entry.  Two elements, each with an unknown-size algorithm and a
   list of size ranges ending in a libcall fallback.
   NOTE(review): element [1] (the one using rep_prefix_8_byte) is
   presumably the 64-bit variant -- confirm against the users of this
   table.  */
2821	static stringop_algs atom_memset[`2`] = {
2822	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`15`, unrolled_loop, false},
2823	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
2824	{.unknown_size: libcall, .size: {{`24`, loop, false}, {`32`, unrolled_loop, false},
2825	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
/* Cost table for atom tuning.  The .hard_register group holds the
   register-allocator move/load/store costs; the entries after it give
   integer instruction costs, memory and cache parameters, x87 and SSE
   instruction costs, the string-operation strategy tables (atom_memcpy,
   atom_memset), branch costs and alignment strings.  The lea cost is
   written as COSTS_N_INSNS (1) + 1, i.e. slightly above the add cost.  */
2826	static const
2827	struct processor_costs atom_cost = {
2828	.hard_register: {
2829	/ Start of register allocator costs. integer->integer move cost is 2. /
2830	.movzbl_load: `6`, / cost for loading QImode using movzbl /
2831	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
2832	in QImode, HImode and SImode.
2833	Relative to reg-reg move (2). /*
2834	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2835	.fp_move: `4`, / cost of reg,reg fld/fst /
2836	.fp_load: {`6`, `6`, `18`}, / cost of loading fp registers*
2837	in SFmode, DFmode and XFmode /*
2838	.fp_store: {`14`, `14`, `24`}, / cost of storing fp registers*
2839	in SFmode, DFmode and XFmode /*
2840	.mmx_move: `2`, / cost of moving MMX register /
2841	.mmx_load: {`8`, `8`}, / cost of loading MMX registers*
2842	in SImode and DImode /*
2843	.mmx_store: {`10`, `10`}, / cost of storing MMX registers*
2844	in SImode and DImode /*
2845	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2846	.sse_load: {`8`, `8`, `8`, `16`, `32`}, / cost of loading SSE registers*
2847	in 32,64,128,256 and 512-bit /*
2848	.sse_store: {`8`, `8`, `8`, `16`, `32`}, / cost of storing SSE registers*
2849	in 32,64,128,256 and 512-bit /*
2850	.sse_to_integer: `8`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
2851	.mask_to_integer: `8`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
2852	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
2853	in QImode, HImode, SImode. /*
2854	.mask_store: {`6`, `6`, `6`}, / cost of storing mask register*
2855	in QImode, HImode, SImode. /*
2856	.mask_move: `2`, / cost of moving mask register. /
2857	/ End of register allocator costs. /
2858	},
2859
2860	COSTS_N_INSNS (`1`), / cost of an add instruction /
2861	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
2862	COSTS_N_INSNS (`1`), / variable shift costs /
2863	COSTS_N_INSNS (`1`), / constant shift costs /
2864	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2865	COSTS_N_INSNS (`4`), / HI /
2866	COSTS_N_INSNS (`3`), / SI /
2867	COSTS_N_INSNS (`4`), / DI /
2868	COSTS_N_INSNS (`2`)}, / other /
2869	.mult_bit: `0`, / cost of multiply per each bit set /
2870	.divide: {COSTS_N_INSNS (`18`), / cost of a divide/mod for QI /
2871	COSTS_N_INSNS (`26`), / HI /
2872	COSTS_N_INSNS (`42`), / SI /
2873	COSTS_N_INSNS (`74`), / DI /
2874	COSTS_N_INSNS (`74`)}, / other /
2875	COSTS_N_INSNS (`1`), / cost of movsx /
2876	COSTS_N_INSNS (`1`), / cost of movzx /
2877	.large_insn: `8`, / "large" insn /
2878	.move_ratio: `17`, / MOVE_RATIO /
2879	.clear_ratio: `6`, / CLEAR_RATIO /
2880	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
2881	in QImode, HImode and SImode.
2882	Relative to reg-reg move (2). /*
2883	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2884	.sse_load: {`8`, `8`, `8`, `16`, `32`}, / cost of loading SSE register*
2885	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2886	.sse_store: {`8`, `8`, `8`, `16`, `32`}, / cost of storing SSE register*
2887	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2888	.sse_unaligned_load: {`16`, `16`, `16`, `32`, `64`}, / cost of unaligned loads. /
2889	.sse_unaligned_store: {`16`, `16`, `16`, `32`, `64`}, / cost of unaligned stores. /
2890	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2891	.sse_to_integer: `8`, / cost of moving SSE register to integer. /
2892	.gather_static: `8`, .gather_per_elt: `8`, / Gather load static, per_elt. /
2893	.scatter_static: `8`, .scatter_per_elt: `8`, / Scatter store static, per_elt. /
2894	.l1_cache_size: `32`, / size of l1 cache. /
2895	.l2_cache_size: `256`, / size of l2 cache. /
2896	.prefetch_block: `64`, / size of prefetch block /
2897	.simultaneous_prefetches: `6`, / number of parallel prefetches /
2898	.branch_cost: `3`, / Branch cost /
2899	COSTS_N_INSNS (`8`), / cost of FADD and FSUB insns. /
2900	COSTS_N_INSNS (`8`), / cost of FMUL instruction. /
2901	COSTS_N_INSNS (`20`), / cost of FDIV instruction. /
2902	COSTS_N_INSNS (`8`), / cost of FABS instruction. /
2903	COSTS_N_INSNS (`8`), / cost of FCHS instruction. /
2904	COSTS_N_INSNS (`40`), / cost of FSQRT instruction. /
2905
2906	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
2907	COSTS_N_INSNS (`5`), / cost of ADDSS/SD SUBSS/SD insns. /
2908	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
2909	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
2910	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
2911	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
2912	COSTS_N_INSNS (`31`), / cost of DIVSS instruction. /
2913	COSTS_N_INSNS (`60`), / cost of DIVSD instruction. /
2914	COSTS_N_INSNS (`31`), / cost of SQRTSS instruction. /
2915	COSTS_N_INSNS (`63`), / cost of SQRTSD instruction. /
2916	.reassoc_int: `2`, .reassoc_fp: `2`, .reassoc_vec_int: `2`, .reassoc_vec_fp: `2`, / reassoc int, fp, vec_int, vec_fp. /
2917	.memcpy: atom_memcpy,
2918	.memset: atom_memset,
2919	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
2920	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
2921	.align_loop: "16", / Loop alignment. /
2922	.align_jump: "16:8:8", / Jump alignment. /
2923	.align_label: "0:0:8", / Label alignment. /
2924	.align_func: "16", / Func alignment. /
2925	.small_unroll_ninsns: `4`, / Small unroll limit. /
2926	.small_unroll_factor: `2`, / Small unroll factor. /
2927	};
2928
/* memcpy expansion strategies; slm_cost selects this table through its
   .memcpy entry.  The contents are the same as atom_memcpy: two elements,
   each with an unknown-size algorithm and a list of size ranges closed by
   a -1 entry.
   NOTE(review): element [1] (the one using rep_prefix_8_byte) is
   presumably the 64-bit variant -- confirm against the users of this
   table.  */
2929	static stringop_algs slm_memcpy[`2`] = {
2930	{.unknown_size: libcall, .size: {{`11`, loop, false}, {-`1`, rep_prefix_4_byte, false}}},
2931	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`64`, rep_prefix_4_byte, false},
2932	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
/* memset expansion strategies; slm_cost selects this table through its
   .memset entry.  The contents are the same as atom_memset: two elements,
   each with an unknown-size algorithm and a list of size ranges ending in
   a libcall fallback.  */
2933	static stringop_algs slm_memset[`2`] = {
2934	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`15`, unrolled_loop, false},
2935	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
2936	{.unknown_size: libcall, .size: {{`24`, loop, false}, {`32`, unrolled_loop, false},
2937	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
/* Cost table for slm tuning.  The .hard_register group holds the
   register-allocator move/load/store costs; the entries after it give
   integer instruction costs, memory and cache parameters, x87 and SSE
   instruction costs, the string-operation strategy tables (slm_memcpy,
   slm_memset), branch costs and alignment strings.  The lea cost is
   written as COSTS_N_INSNS (1) + 1, i.e. slightly above the add cost.  */
2938	static const
2939	struct processor_costs slm_cost = {
2940	.hard_register: {
2941	/ Start of register allocator costs. integer->integer move cost is 2. /
2942	.movzbl_load: `8`, / cost for loading QImode using movzbl /
2943	.int_load: {`8`, `8`, `8`}, / cost of loading integer registers*
2944	in QImode, HImode and SImode.
2945	Relative to reg-reg move (2). /*
2946	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2947	.fp_move: `2`, / cost of reg,reg fld/fst /
2948	.fp_load: {`8`, `8`, `18`}, / cost of loading fp registers*
2949	in SFmode, DFmode and XFmode /*
2950	.fp_store: {`6`, `6`, `18`}, / cost of storing fp registers*
2951	in SFmode, DFmode and XFmode /*
2952	.mmx_move: `2`, / cost of moving MMX register /
2953	.mmx_load: {`8`, `8`}, / cost of loading MMX registers*
2954	in SImode and DImode /*
2955	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
2956	in SImode and DImode /*
2957	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
2958	.sse_load: {`8`, `8`, `8`, `16`, `32`}, / cost of loading SSE registers*
2959	in 32,64,128,256 and 512-bit /*
2960	.sse_store: {`8`, `8`, `8`, `16`, `32`}, / cost of storing SSE registers*
2961	in 32,64,128,256 and 512-bit /*
2962	.sse_to_integer: `8`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
2963	.mask_to_integer: `8`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
2964	.mask_load: {`8`, `8`, `8`}, / cost of loading mask register*
2965	in QImode, HImode, SImode. /*
2966	.mask_store: {`6`, `6`, `6`}, / cost of storing mask register*
2967	in QImode, HImode, SImode. /*
2968	.mask_move: `2`, / cost of moving mask register. /
2969	/ End of register allocator costs. /
2970	},
2971
2972	COSTS_N_INSNS (`1`), / cost of an add instruction /
2973	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
2974	COSTS_N_INSNS (`1`), / variable shift costs /
2975	COSTS_N_INSNS (`1`), / constant shift costs /
2976	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
2977	COSTS_N_INSNS (`3`), / HI /
2978	COSTS_N_INSNS (`3`), / SI /
2979	COSTS_N_INSNS (`4`), / DI /
2980	COSTS_N_INSNS (`2`)}, / other /
2981	.mult_bit: `0`, / cost of multiply per each bit set /
2982	.divide: {COSTS_N_INSNS (`18`), / cost of a divide/mod for QI /
2983	COSTS_N_INSNS (`26`), / HI /
2984	COSTS_N_INSNS (`42`), / SI /
2985	COSTS_N_INSNS (`74`), / DI /
2986	COSTS_N_INSNS (`74`)}, / other /
2987	COSTS_N_INSNS (`1`), / cost of movsx /
2988	COSTS_N_INSNS (`1`), / cost of movzx /
2989	.large_insn: `8`, / "large" insn /
2990	.move_ratio: `17`, / MOVE_RATIO /
2991	.clear_ratio: `6`, / CLEAR_RATIO /
2992	.int_load: {`8`, `8`, `8`}, / cost of loading integer registers*
2993	in QImode, HImode and SImode.
2994	Relative to reg-reg move (2). /*
2995	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
2996	.sse_load: {`8`, `8`, `8`, `16`, `32`}, / cost of loading SSE register*
2997	in 32bit, 64bit, 128bit, 256bit and 512bit /*
2998	.sse_store: {`8`, `8`, `8`, `16`, `32`}, / cost of storing SSE register*
2999	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3000	.sse_unaligned_load: {`16`, `16`, `16`, `32`, `64`}, / cost of unaligned loads. /
3001	.sse_unaligned_store: {`16`, `16`, `16`, `32`, `64`}, / cost of unaligned stores. /
3002	.xmm_move: `2`, .ymm_move: `4`, .zmm_move: `8`, / cost of moving XMM,YMM,ZMM register /
3003	.sse_to_integer: `8`, / cost of moving SSE register to integer. /
3004	.gather_static: `8`, .gather_per_elt: `8`, / Gather load static, per_elt. /
3005	.scatter_static: `8`, .scatter_per_elt: `8`, / Scatter store static, per_elt. /
3006	.l1_cache_size: `32`, / size of l1 cache. /
3007	.l2_cache_size: `256`, / size of l2 cache. /
3008	.prefetch_block: `64`, / size of prefetch block /
3009	.simultaneous_prefetches: `6`, / number of parallel prefetches /
3010	.branch_cost: `3`, / Branch cost /
3011	COSTS_N_INSNS (`8`), / cost of FADD and FSUB insns. /
3012	COSTS_N_INSNS (`8`), / cost of FMUL instruction. /
3013	COSTS_N_INSNS (`20`), / cost of FDIV instruction. /
3014	COSTS_N_INSNS (`8`), / cost of FABS instruction. /
3015	COSTS_N_INSNS (`8`), / cost of FCHS instruction. /
3016	COSTS_N_INSNS (`40`), / cost of FSQRT instruction. /
3017
3018	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3019	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
3020	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
3021	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
3022	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
3023	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
3024	COSTS_N_INSNS (`39`), / cost of DIVSS instruction. /
3025	COSTS_N_INSNS (`69`), / cost of DIVSD instruction. /
3026	COSTS_N_INSNS (`20`), / cost of SQRTSS instruction. /
3027	COSTS_N_INSNS (`35`), / cost of SQRTSD instruction. /
3028	.reassoc_int: `1`, .reassoc_fp: `2`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
3029	.memcpy: slm_memcpy,
3030	.memset: slm_memset,
3031	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
3032	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
3033	.align_loop: "16", / Loop alignment. /
3034	.align_jump: "16:8:8", / Jump alignment. /
3035	.align_label: "0:0:8", / Label alignment. /
3036	.align_func: "16", / Func alignment. /
3037	.small_unroll_ninsns: `4`, / Small unroll limit. /
3038	.small_unroll_factor: `2`, / Small unroll factor. /
3039	};
3040
/* memcpy expansion strategies for tremont.  Both elements are identical:
   libcall for unknown sizes, then rep_prefix_1_byte (flag true) and loop
   (flag false) ranges that share the bound 256, then libcall.
   NOTE(review): the second range repeats the first one's bound, so it
   presumably only serves as a fallback when rep_prefix_1_byte cannot be
   used -- confirm against the code that walks this table.  */
3041	static stringop_algs tremont_memcpy[`2`] = {
3042	{.unknown_size: libcall,
3043	.size: {{`256`, rep_prefix_1_byte, true},
3044	{`256`, loop, false},
3045	{-`1`, libcall, false}}},
3046	{.unknown_size: libcall,
3047	.size: {{`256`, rep_prefix_1_byte, true},
3048	{`256`, loop, false},
3049	{-`1`, libcall, false}}}};
/* memset expansion strategies for tremont.  Both elements are identical
   and use the same ranges as tremont_memcpy: libcall for unknown sizes,
   then rep_prefix_1_byte (flag true) and loop (flag false) ranges that
   share the bound 256, then libcall.
   NOTE(review): the second range repeats the first one's bound, so it
   presumably only serves as a fallback when rep_prefix_1_byte cannot be
   used -- confirm against the code that walks this table.  */
3050	static stringop_algs tremont_memset[`2`] = {
3051	{.unknown_size: libcall,
3052	.size: {{`256`, rep_prefix_1_byte, true},
3053	{`256`, loop, false},
3054	{-`1`, libcall, false}}},
3055	{.unknown_size: libcall,
3056	.size: {{`256`, rep_prefix_1_byte, true},
3057	{`256`, loop, false},
3058	{-`1`, libcall, false}}}};
/* Processor cost table for tremont.
   The .hard_register group holds the values that its own comments mark
   as register allocator costs.  The entries after it give integer
   instruction costs (through COSTS_N_INSNS), move/clear ratios,
   load/store costs, gather/scatter costs, cache and prefetch parameters,
   x87 and SSE arithmetic costs, reassociation widths, the
   tremont_memcpy/tremont_memset strategy tables, branch costs and
   alignment/unroll settings.
   Entries without a field label are positional, so their order must be
   preserved when editing.
   Apart from .clear_ratio (17 here) and the memcpy/memset tables, every
   value matches generic_cost.  */
3059	static const
3060	struct processor_costs tremont_cost = {
3061	.hard_register: {
3062	/ Start of register allocator costs. integer->integer move cost is 2. /
3063	.movzbl_load: `6`, / cost for loading QImode using movzbl /
3064	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
3065	in QImode, HImode and SImode.
3066	Relative to reg-reg move (2). /*
3067	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3068	.fp_move: `4`, / cost of reg,reg fld/fst /
3069	.fp_load: {`6`, `6`, `12`}, / cost of loading fp registers*
3070	in SFmode, DFmode and XFmode /*
3071	.fp_store: {`6`, `6`, `12`}, / cost of storing fp registers*
3072	in SFmode, DFmode and XFmode /*
3073	.mmx_move: `2`, / cost of moving MMX register /
3074	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
3075	in SImode and DImode /*
3076	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
3077	in SImode and DImode /*
3078	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
3079	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE registers*
3080	in 32,64,128,256 and 512-bit /*
3081	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE registers*
3082	in 32,64,128,256 and 512-bit /*
3083	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
3084	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
3085	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
3086	in QImode, HImode, SImode. /*
3087	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
3088	in QImode, HImode, SImode. /*
3089	.mask_move: `2`, / cost of moving mask register. /
3090	/ End of register allocator costs. /
3091	},
3092
3093	COSTS_N_INSNS (`1`), / cost of an add instruction /
3094	/ Setting cost to 2 makes our current implementation of synth_mult result in*
3095	use of unnecessary temporary registers causing regression on several
3096	SPECfp benchmarks. /*
3097	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
3098	COSTS_N_INSNS (`1`), / variable shift costs /
3099	COSTS_N_INSNS (`1`), / constant shift costs /
3100	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
3101	COSTS_N_INSNS (`4`), / HI /
3102	COSTS_N_INSNS (`3`), / SI /
3103	COSTS_N_INSNS (`4`), / DI /
3104	COSTS_N_INSNS (`4`)}, / other /
3105	.mult_bit: `0`, / cost of multiply per each bit set /
3106	.divide: {COSTS_N_INSNS (`16`), / cost of a divide/mod for QI /
3107	COSTS_N_INSNS (`22`), / HI /
3108	COSTS_N_INSNS (`30`), / SI /
3109	COSTS_N_INSNS (`74`), / DI /
3110	COSTS_N_INSNS (`74`)}, / other /
3111	COSTS_N_INSNS (`1`), / cost of movsx /
3112	COSTS_N_INSNS (`1`), / cost of movzx /
3113	.large_insn: `8`, / "large" insn /
3114	.move_ratio: `17`, / MOVE_RATIO /
3115	.clear_ratio: `17`, / CLEAR_RATIO /
3116	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
3117	in QImode, HImode and SImode.
3118	Relative to reg-reg move (2). /*
3119	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3120	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE register*
3121	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3122	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE register*
3123	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3124	.sse_unaligned_load: {`6`, `6`, `6`, `10`, `15`}, / cost of unaligned loads. /
3125	.sse_unaligned_store: {`6`, `6`, `6`, `10`, `15`}, / cost of unaligned storess. /
3126	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
3127	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
3128	.gather_static: `18`, .gather_per_elt: `6`, / Gather load static, per_elt. /
3129	.scatter_static: `18`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
3130	.l1_cache_size: `32`, / size of l1 cache. /
3131	.l2_cache_size: `512`, / size of l2 cache. /
3132	.prefetch_block: `64`, / size of prefetch block /
3133	.simultaneous_prefetches: `6`, / number of parallel prefetches /
3134	/ Benchmarks shows large regressions on K8 sixtrack benchmark when this*
3135	value is increased to perhaps more appropriate value of 5. /*
3136	.branch_cost: `3`, / Branch cost /
3137	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
3138	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
3139	COSTS_N_INSNS (`17`), / cost of FDIV instruction. /
3140	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
3141	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
3142	COSTS_N_INSNS (`14`), / cost of FSQRT instruction. /
3143
3144	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3145	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
3146	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
3147	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
3148	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
3149	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
3150	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
3151	COSTS_N_INSNS (`17`), / cost of DIVSD instruction. /
3152	COSTS_N_INSNS (`14`), / cost of SQRTSS instruction. /
3153	COSTS_N_INSNS (`18`), / cost of SQRTSD instruction. /
3154	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `3`, / reassoc int, fp, vec_int, vec_fp. /
3155	.memcpy: tremont_memcpy,
3156	.memset: tremont_memset,
3157	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
3158	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
3159	.align_loop: "16:11:8", / Loop alignment. /
3160	.align_jump: "16:11:8", / Jump alignment. /
3161	.align_label: "0:0:8", / Label alignment. /
3162	.align_func: "16", / Func alignment. /
3163	.small_unroll_ninsns: `4`, / Small unroll limit. /
3164	.small_unroll_factor: `2`, / Small unroll factor. /
3165	};
3166
/* memcpy strategy table referenced by the .memcpy entry of intel_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.  Element 0 ends in rep_prefix_4_byte
   rather than libcall for the -1 bound.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3167	static stringop_algs intel_memcpy[`2`] = {
3168	{.unknown_size: libcall, .size: {{`11`, loop, false}, {-`1`, rep_prefix_4_byte, false}}},
3169	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`64`, rep_prefix_4_byte, false},
3170	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
/* memset strategy table referenced by the .memset entry of intel_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3171	static stringop_algs intel_memset[`2`] = {
3172	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`15`, unrolled_loop, false},
3173	{`2048`, rep_prefix_4_byte, false}, {-`1`, libcall, false}}},
3174	{.unknown_size: libcall, .size: {{`24`, loop, false}, {`32`, unrolled_loop, false},
3175	{`8192`, rep_prefix_8_byte, false}, {-`1`, libcall, false}}}};
/* Processor cost table named intel_cost.
   The .hard_register group holds the values that its own comments mark
   as register allocator costs.  The entries after it give integer
   instruction costs (through COSTS_N_INSNS), move/clear ratios,
   load/store costs, gather/scatter costs, cache and prefetch parameters,
   x87 and SSE arithmetic costs, reassociation widths, the
   intel_memcpy/intel_memset strategy tables, branch costs and
   alignment/unroll settings.
   Entries without a field label are positional, so their order must be
   preserved when editing.
   NOTE(review): this chunk does not say which Intel CPUs the table is
   meant for -- confirm where intel_cost is selected.
   NOTE(review): the trailing comment on the .sse_unaligned_store line
   says "loads"; the labelled field is the unaligned store cost.  */
3176	static const
3177	struct processor_costs intel_cost = {
3178	.hard_register: {
3179	/ Start of register allocator costs. integer->integer move cost is 2. /
3180	.movzbl_load: `6`, / cost for loading QImode using movzbl /
3181	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
3182	in QImode, HImode and SImode.
3183	Relative to reg-reg move (2). /*
3184	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3185	.fp_move: `2`, / cost of reg,reg fld/fst /
3186	.fp_load: {`6`, `6`, `8`}, / cost of loading fp registers*
3187	in SFmode, DFmode and XFmode /*
3188	.fp_store: {`6`, `6`, `10`}, / cost of storing fp registers*
3189	in SFmode, DFmode and XFmode /*
3190	.mmx_move: `2`, / cost of moving MMX register /
3191	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
3192	in SImode and DImode /*
3193	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
3194	in SImode and DImode /*
3195	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `2`, / cost of moving XMM,YMM,ZMM register /
3196	.sse_load: {`6`, `6`, `6`, `6`, `6`}, / cost of loading SSE registers*
3197	in 32,64,128,256 and 512-bit /*
3198	.sse_store: {`6`, `6`, `6`, `6`, `6`}, / cost of storing SSE registers*
3199	in 32,64,128,256 and 512-bit /*
3200	.sse_to_integer: `4`, .integer_to_sse: `4`, / SSE->integer and integer->SSE moves /
3201	.mask_to_integer: `4`, .integer_to_mask: `4`, / mask->integer and integer->mask moves /
3202	.mask_load: {`4`, `4`, `4`}, / cost of loading mask register*
3203	in QImode, HImode, SImode. /*
3204	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
3205	in QImode, HImode, SImode. /*
3206	.mask_move: `2`, / cost of moving mask register. /
3207	/ End of register allocator costs. /
3208	},
3209
3210	COSTS_N_INSNS (`1`), / cost of an add instruction /
3211	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
3212	COSTS_N_INSNS (`1`), / variable shift costs /
3213	COSTS_N_INSNS (`1`), / constant shift costs /
3214	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
3215	COSTS_N_INSNS (`3`), / HI /
3216	COSTS_N_INSNS (`3`), / SI /
3217	COSTS_N_INSNS (`4`), / DI /
3218	COSTS_N_INSNS (`2`)}, / other /
3219	.mult_bit: `0`, / cost of multiply per each bit set /
3220	.divide: {COSTS_N_INSNS (`18`), / cost of a divide/mod for QI /
3221	COSTS_N_INSNS (`26`), / HI /
3222	COSTS_N_INSNS (`42`), / SI /
3223	COSTS_N_INSNS (`74`), / DI /
3224	COSTS_N_INSNS (`74`)}, / other /
3225	COSTS_N_INSNS (`1`), / cost of movsx /
3226	COSTS_N_INSNS (`1`), / cost of movzx /
3227	.large_insn: `8`, / "large" insn /
3228	.move_ratio: `17`, / MOVE_RATIO /
3229	.clear_ratio: `6`, / CLEAR_RATIO /
3230	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
3231	in QImode, HImode and SImode.
3232	Relative to reg-reg move (2). /*
3233	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3234	.sse_load: {`6`, `6`, `6`, `6`, `6`}, / cost of loading SSE register*
3235	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3236	.sse_store: {`6`, `6`, `6`, `6`, `6`}, / cost of storing SSE register*
3237	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3238	.sse_unaligned_load: {`10`, `10`, `10`, `10`, `10`}, / cost of unaligned loads. /
3239	.sse_unaligned_store: {`10`, `10`, `10`, `10`, `10`}, / cost of unaligned loads. /
3240	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `2`, / cost of moving XMM,YMM,ZMM register /
3241	.sse_to_integer: `4`, / cost of moving SSE register to integer. /
3242	.gather_static: `6`, .gather_per_elt: `6`, / Gather load static, per_elt. /
3243	.scatter_static: `6`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
3244	.l1_cache_size: `32`, / size of l1 cache. /
3245	.l2_cache_size: `256`, / size of l2 cache. /
3246	.prefetch_block: `64`, / size of prefetch block /
3247	.simultaneous_prefetches: `6`, / number of parallel prefetches /
3248	.branch_cost: `3`, / Branch cost /
3249	COSTS_N_INSNS (`8`), / cost of FADD and FSUB insns. /
3250	COSTS_N_INSNS (`8`), / cost of FMUL instruction. /
3251	COSTS_N_INSNS (`20`), / cost of FDIV instruction. /
3252	COSTS_N_INSNS (`8`), / cost of FABS instruction. /
3253	COSTS_N_INSNS (`8`), / cost of FCHS instruction. /
3254	COSTS_N_INSNS (`40`), / cost of FSQRT instruction. /
3255
3256	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3257	COSTS_N_INSNS (`8`), / cost of ADDSS/SD SUBSS/SD insns. /
3258	COSTS_N_INSNS (`8`), / cost of MULSS instruction. /
3259	COSTS_N_INSNS (`8`), / cost of MULSD instruction. /
3260	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
3261	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
3262	COSTS_N_INSNS (`20`), / cost of DIVSS instruction. /
3263	COSTS_N_INSNS (`20`), / cost of DIVSD instruction. /
3264	COSTS_N_INSNS (`40`), / cost of SQRTSS instruction. /
3265	COSTS_N_INSNS (`40`), / cost of SQRTSD instruction. /
3266	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `1`, .reassoc_vec_fp: `1`, / reassoc int, fp, vec_int, vec_fp. /
3267	.memcpy: intel_memcpy,
3268	.memset: intel_memset,
3269	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
3270	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
3271	.align_loop: "16", / Loop alignment. /
3272	.align_jump: "16:8:8", / Jump alignment. /
3273	.align_label: "0:0:8", / Label alignment. /
3274	.align_func: "16", / Func alignment. /
3275	.small_unroll_ninsns: `4`, / Small unroll limit. /
3276	.small_unroll_factor: `2`, / Small unroll factor. /
3277	};
3278
3279	/* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
/* memcpy strategy table referenced by the .memcpy entry of lujiazui_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3280	static stringop_algs lujiazui_memcpy[`2`] = {
3281	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`8192`, rep_prefix_4_byte, false},
3282	{-`1`, libcall, false}}},
3283	{.unknown_size: libcall, .size: {{`12`, unrolled_loop, true}, {`32`, loop, false},
3284	{`6144`, rep_prefix_8_byte, false},
3285	{-`1`, libcall, false}}}};
/* memset strategy table referenced by the .memset entry of lujiazui_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.
   NOTE(review): element 1 selects loop for both the 12 and the 32 bound;
   the two triples differ only in the flag -- confirm this is intended.  */
3286	static stringop_algs lujiazui_memset[`2`] = {
3287	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`8192`, rep_prefix_4_byte, false},
3288	{-`1`, libcall, false}}},
3289	{.unknown_size: libcall, .size: {{`12`, loop, true}, {`32`, loop, false},
3290	{`640`, rep_prefix_8_byte, false},
3291	{-`1`, libcall, false}}}};
/* Processor cost table for lujiazui.
   The .hard_register group holds the values that its own comments mark
   as register allocator costs.  The entries after it give integer
   instruction costs (through COSTS_N_INSNS), move/clear ratios,
   load/store costs, gather/scatter costs, cache and prefetch parameters,
   x87 and SSE arithmetic costs, reassociation widths, the
   lujiazui_memcpy/lujiazui_memset strategy tables, branch costs and
   alignment/unroll settings.
   Entries without a field label are positional, so their order must be
   preserved when editing.  */
3292	static const
3293	struct processor_costs lujiazui_cost = {
3294	.hard_register: {
3295	/ Start of register allocator costs. integer->integer move cost is 2. /
3296	.movzbl_load: `6`, / cost for loading QImode using movzbl. /
3297	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
3298	in QImode, HImode and SImode.
3299	Relative to reg-reg move (2). /*
3300	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers. /
3301	.fp_move: `2`, / cost of reg,reg fld/fst. /
3302	.fp_load: {`6`, `6`, `8`}, / cost of loading fp registers*
3303	in SFmode, DFmode and XFmode. /*
3304	.fp_store: {`6`, `6`, `8`}, / cost of storing fp registers*
3305	in SFmode, DFmode and XFmode. /*
3306	.mmx_move: `2`, / cost of moving MMX register. /
3307	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
3308	in SImode and DImode. /*
3309	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
3310	in SImode and DImode. /*
3311	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register. /
3312	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE registers*
3313	in 32,64,128,256 and 512-bit. /*
3314	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE registers*
3315	in 32,64,128,256 and 512-bit. /*
3316	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves. /
3317	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves. /
3318	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
3319	in QImode, HImode, SImode. /*
3320	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
3321	in QImode, HImode, SImode. /*
3322	.mask_move: `2`, / cost of moving mask register. /
3323	/ End of register allocator costs. /
3324	},
3325
3326	COSTS_N_INSNS (`1`), / cost of an add instruction. /
3327	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction. /
3328	COSTS_N_INSNS (`1`), / variable shift costs. /
3329	COSTS_N_INSNS (`1`), / constant shift costs. /
3330	.mult_init: {COSTS_N_INSNS (`2`), / cost of starting multiply for QI. /
3331	COSTS_N_INSNS (`3`), / HI. /
3332	COSTS_N_INSNS (`3`), / SI. /
3333	COSTS_N_INSNS (`12`), / DI. /
3334	COSTS_N_INSNS (`14`)}, / other. /
3335	.mult_bit: `0`, / cost of multiply per each bit set. /
3336	.divide: {COSTS_N_INSNS (`22`), / cost of a divide/mod for QI. /
3337	COSTS_N_INSNS (`24`), / HI. /
3338	COSTS_N_INSNS (`24`), / SI. /
3339	COSTS_N_INSNS (`150`), / DI. /
3340	COSTS_N_INSNS (`152`)}, / other. /
3341	COSTS_N_INSNS (`1`), / cost of movsx. /
3342	COSTS_N_INSNS (`1`), / cost of movzx. /
3343	.large_insn: `8`, / "large" insn. /
3344	.move_ratio: `17`, / MOVE_RATIO. /
3345	.clear_ratio: `6`, / CLEAR_RATIO. /
3346	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
3347	in QImode, HImode and SImode.
3348	Relative to reg-reg move (2). /*
3349	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers. /
3350	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE register*
3351	in 32bit, 64bit, 128bit, 256bit and 512bit. /*
3352	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE register*
3353	in 32bit, 64bit, 128bit, 256bit and 512bit. /*
3354	.sse_unaligned_load: {`6`, `6`, `6`, `10`, `15`}, / cost of unaligned loads. /
3355	.sse_unaligned_store: {`6`, `6`, `6`, `10`, `15`}, / cost of unaligned storess. /
3356	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register. /
3357	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
3358	.gather_static: `18`, .gather_per_elt: `6`, / Gather load static, per_elt. /
3359	.scatter_static: `18`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
3360	.l1_cache_size: `32`, / size of l1 cache. /
3361	.l2_cache_size: `4096`, / size of l2 cache. /
3362	.prefetch_block: `64`, / size of prefetch block. /
3363	/ Lujiazui processor never drop prefetches, like AMD processors. /
3364	.simultaneous_prefetches: `100`, / number of parallel prefetches. /
3365	.branch_cost: `3`, / Branch cost. /
3366	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
3367	COSTS_N_INSNS (`4`), / cost of FMUL instruction. /
3368	COSTS_N_INSNS (`22`), / cost of FDIV instruction. /
3369	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
3370	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
3371	COSTS_N_INSNS (`44`), / cost of FSQRT instruction. /
3372
3373	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3374	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
3375	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
3376	COSTS_N_INSNS (`4`), / cost of MULSD instruction. /
3377	COSTS_N_INSNS (`6`), / cost of FMA SS instruction. /
3378	COSTS_N_INSNS (`6`), / cost of FMA SD instruction. /
3379	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
3380	COSTS_N_INSNS (`17`), / cost of DIVSD instruction. /
3381	COSTS_N_INSNS (`32`), / cost of SQRTSS instruction. /
3382	COSTS_N_INSNS (`60`), / cost of SQRTSD instruction. /
3383	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `3`, / reassoc int, fp, vec_int, vec_fp. /
3384	.memcpy: lujiazui_memcpy,
3385	.memset: lujiazui_memset,
3386	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
3387	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
3388	.align_loop: "16:11:8", / Loop alignment. /
3389	.align_jump: "16:11:8", / Jump alignment. /
3390	.align_label: "0:0:8", / Label alignment. /
3391	.align_func: "16", / Func alignment. /
3392	.small_unroll_ninsns: `4`, / Small unroll limit. /
3393	.small_unroll_factor: `2`, / Small unroll factor. /
3394	};
3395
3396	/* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU.  */
/* memcpy strategy table referenced by the .memcpy entry of yongfeng_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3397	static stringop_algs yongfeng_memcpy[`2`] = {
3398	{.unknown_size: libcall, .size: {{`6`, unrolled_loop, true}, {`256`, unrolled_loop, false},
3399	{-`1`, libcall, false}}},
3400	{.unknown_size: libcall, .size: {{`8`, loop, false}, {`512`, unrolled_loop, false},
3401	{-`1`, libcall, false}}}};
/* memset strategy table referenced by the .memset entry of yongfeng_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3402	static stringop_algs yongfeng_memset[`2`] = {
3403	{.unknown_size: libcall, .size: {{`6`, loop_1_byte, false}, {`128`, loop, false},
3404	{-`1`, libcall, false}}},
3405	{.unknown_size: libcall, .size: {{`2`, rep_prefix_4_byte, false}, {`64`, loop, false},
3406	{`1024`, vector_loop, false},
3407	{-`1`, libcall, false}}}};
/* Processor cost table for yongfeng.
   The .hard_register group holds the values that its own comments mark
   as register allocator costs.  The entries after it give integer
   instruction costs (through COSTS_N_INSNS), move/clear ratios,
   load/store costs, gather/scatter costs, cache and prefetch parameters,
   x87 and SSE arithmetic costs, reassociation widths, the
   yongfeng_memcpy/yongfeng_memset strategy tables, branch costs and
   alignment/unroll settings.
   Entries without a field label are positional, so their order must be
   preserved when editing.
   NOTE(review): the 256-bit .sse_load/.sse_store cost is 10 inside
   .hard_register but 12 in the same-named entries after it -- confirm
   the difference is intentional.  */
3408	static const
3409	struct processor_costs yongfeng_cost = {
3410	.hard_register: {
3411	/ Start of register allocator costs. integer->integer move cost is 2. /
3412	.movzbl_load: `8`, / cost for loading QImode using movzbl. /
3413	.int_load: {`8`, `8`, `8`}, / cost of loading integer registers*
3414	in QImode, HImode and SImode.
3415	Relative to reg-reg move (2). /*
3416	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers. /
3417	.fp_move: `2`, / cost of reg,reg fld/fst. /
3418	.fp_load: {`8`, `8`, `8`}, / cost of loading fp registers*
3419	in SFmode, DFmode and XFmode. /*
3420	.fp_store: {`8`, `8`, `8`}, / cost of storing fp registers*
3421	in SFmode, DFmode and XFmode. /*
3422	.mmx_move: `2`, / cost of moving MMX register. /
3423	.mmx_load: {`8`, `8`}, / cost of loading MMX registers*
3424	in SImode and DImode. /*
3425	.mmx_store: {`8`, `8`}, / cost of storing MMX registers*
3426	in SImode and DImode. /*
3427	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register. /
3428	.sse_load: {`8`, `8`, `8`, `10`, `15`}, / cost of loading SSE registers*
3429	in 32,64,128,256 and 512-bit. /*
3430	.sse_store: {`8`, `8`, `8`, `10`, `15`}, / cost of storing SSE registers*
3431	in 32,64,128,256 and 512-bit. /*
3432	.sse_to_integer: `8`, .integer_to_sse: `8`, / SSE->integer and integer->SSE moves. /
3433	.mask_to_integer: `8`, .integer_to_mask: `8`, / mask->integer and integer->mask moves. /
3434	.mask_load: {`8`, `8`, `8`}, / cost of loading mask register*
3435	in QImode, HImode, SImode. /*
3436	.mask_store: {`8`, `8`, `8`}, / cost if storing mask register*
3437	in QImode, HImode, SImode. /*
3438	.mask_move: `2`, / cost of moving mask register. /
3439	/ End of register allocator costs. /
3440	},
3441
3442	COSTS_N_INSNS (`1`), / cost of an add instruction. /
3443	COSTS_N_INSNS (`1`), / cost of a lea instruction. /
3444	COSTS_N_INSNS (`1`), / variable shift costs. /
3445	COSTS_N_INSNS (`1`), / constant shift costs. /
3446	.mult_init: {COSTS_N_INSNS (`2`), / cost of starting multiply for QI. /
3447	COSTS_N_INSNS (`3`), / HI. /
3448	COSTS_N_INSNS (`2`), / SI. /
3449	COSTS_N_INSNS (`2`), / DI. /
3450	COSTS_N_INSNS (`3`)}, / other. /
3451	.mult_bit: `0`, / cost of multiply per each bit set. /
3452	.divide: {COSTS_N_INSNS (`8`), / cost of a divide/mod for QI. /
3453	COSTS_N_INSNS (`9`), / HI. /
3454	COSTS_N_INSNS (`8`), / SI. /
3455	COSTS_N_INSNS (`41`), / DI. /
3456	COSTS_N_INSNS (`41`)}, / other. /
3457	COSTS_N_INSNS (`1`), / cost of movsx. /
3458	COSTS_N_INSNS (`1`), / cost of movzx. /
3459	.large_insn: `8`, / "large" insn. /
3460	.move_ratio: `17`, / MOVE_RATIO. /
3461	.clear_ratio: `6`, / CLEAR_RATIO. /
3462	.int_load: {`8`, `8`, `8`}, / cost of loading integer registers*
3463	in QImode, HImode and SImode.
3464	Relative to reg-reg move (2). /*
3465	.int_store: {`8`, `8`, `8`}, / cost of storing integer registers. /
3466	.sse_load: {`8`, `8`, `8`, `12`, `15`}, / cost of loading SSE register*
3467	in 32bit, 64bit, 128bit, 256bit and 512bit. /*
3468	.sse_store: {`8`, `8`, `8`, `12`, `15`}, / cost of storing SSE register*
3469	in 32bit, 64bit, 128bit, 256bit and 512bit. /*
3470	.sse_unaligned_load: {`8`, `8`, `8`, `12`, `15`}, / cost of unaligned loads. /
3471	.sse_unaligned_store: {`8`, `8`, `8`, `12`, `15`}, / cost of unaligned storess. /
3472	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register. /
3473	.sse_to_integer: `8`, / cost of moving SSE register to integer. /
3474	.gather_static: `18`, .gather_per_elt: `6`, / Gather load static, per_elt. /
3475	.scatter_static: `18`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
3476	.l1_cache_size: `32`, / size of l1 cache. /
3477	.l2_cache_size: `256`, / size of l2 cache. /
3478	.prefetch_block: `64`, / size of prefetch block. /
3479	.simultaneous_prefetches: `12`, / number of parallel prefetches. /
3480	.branch_cost: `3`, / Branch cost. /
3481	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
3482	COSTS_N_INSNS (`3`), / cost of FMUL instruction. /
3483	COSTS_N_INSNS (`14`), / cost of FDIV instruction. /
3484	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
3485	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
3486	COSTS_N_INSNS (`40`), / cost of FSQRT instruction. /
3487
3488	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3489	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
3490	COSTS_N_INSNS (`3`), / cost of MULSS instruction. /
3491	COSTS_N_INSNS (`3`), / cost of MULSD instruction. /
3492	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
3493	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
3494	COSTS_N_INSNS (`10`), / cost of DIVSS instruction. /
3495	COSTS_N_INSNS (`14`), / cost of DIVSD instruction. /
3496	COSTS_N_INSNS (`20`), / cost of SQRTSS instruction. /
3497	COSTS_N_INSNS (`35`), / cost of SQRTSD instruction. /
3498	.reassoc_int: `4`, .reassoc_fp: `4`, .reassoc_vec_int: `4`, .reassoc_vec_fp: `4`, / reassoc int, fp, vec_int, vec_fp. /
3499	.memcpy: yongfeng_memcpy,
3500	.memset: yongfeng_memset,
3501	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
3502	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
3503	.align_loop: "16:11:8", / Loop alignment. /
3504	.align_jump: "16:11:8", / Jump alignment. /
3505	.align_label: "0:0:8", / Label alignment. /
3506	.align_func: "16", / Func alignment. /
3507	.small_unroll_ninsns: `4`, / Small unroll limit. /
3508	.small_unroll_factor: `2`, / Small unroll factor. /
3509	};
3510
3511
3512	/* Generic should produce code tuned for Core-i7 (and newer chips)
3513	   and btver1 (and newer chips).  */
3514
/* memcpy strategy table referenced by the .memcpy entry of generic_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.  The two elements differ only in the
   rep prefix used up to 8192 (4-byte vs 8-byte).
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3515	static stringop_algs generic_memcpy[`2`] = {
3516	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`8192`, rep_prefix_4_byte, false},
3517	{-`1`, libcall, false}}},
3518	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`8192`, rep_prefix_8_byte, false},
3519	{-`1`, libcall, false}}}};
/* memset strategy table referenced by the .memset entry of generic_cost.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.  The two elements differ only in the
   rep prefix used up to 8192 (4-byte vs 8-byte).
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3520	static stringop_algs generic_memset[`2`] = {
3521	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`8192`, rep_prefix_4_byte, false},
3522	{-`1`, libcall, false}}},
3523	{.unknown_size: libcall, .size: {{`32`, loop, false}, {`8192`, rep_prefix_8_byte, false},
3524	{-`1`, libcall, false}}}};
/* Processor cost table for generic tuning.
   The .hard_register group holds the values that its own comments mark
   as register allocator costs.  The entries after it give integer
   instruction costs (through COSTS_N_INSNS), move/clear ratios,
   load/store costs, gather/scatter costs, cache and prefetch parameters,
   x87 and SSE arithmetic costs, reassociation widths, the
   generic_memcpy/generic_memset strategy tables, branch costs and
   alignment/unroll settings.
   Entries without a field label are positional, so their order must be
   preserved when editing.  */
3525	static const
3526	struct processor_costs generic_cost = {
3527	.hard_register: {
3528	/ Start of register allocator costs. integer->integer move cost is 2. /
3529	.movzbl_load: `6`, / cost for loading QImode using movzbl /
3530	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
3531	in QImode, HImode and SImode.
3532	Relative to reg-reg move (2). /*
3533	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3534	.fp_move: `4`, / cost of reg,reg fld/fst /
3535	.fp_load: {`6`, `6`, `12`}, / cost of loading fp registers*
3536	in SFmode, DFmode and XFmode /*
3537	.fp_store: {`6`, `6`, `12`}, / cost of storing fp registers*
3538	in SFmode, DFmode and XFmode /*
3539	.mmx_move: `2`, / cost of moving MMX register /
3540	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
3541	in SImode and DImode /*
3542	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
3543	in SImode and DImode /*
3544	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
3545	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE registers*
3546	in 32,64,128,256 and 512-bit /*
3547	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE registers*
3548	in 32,64,128,256 and 512-bit /*
3549	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
3550	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
3551	.mask_load: {`6`, `6`, `6`}, / cost of loading mask register*
3552	in QImode, HImode, SImode. /*
3553	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
3554	in QImode, HImode, SImode. /*
3555	.mask_move: `2`, / cost of moving mask register. /
3556	/ End of register allocator costs. /
3557	},
3558
3559	COSTS_N_INSNS (`1`), / cost of an add instruction /
3560	/ Setting cost to 2 makes our current implementation of synth_mult result in*
3561	use of unnecessary temporary registers causing regression on several
3562	SPECfp benchmarks. /*
3563	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
3564	COSTS_N_INSNS (`1`), / variable shift costs /
3565	COSTS_N_INSNS (`1`), / constant shift costs /
3566	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
3567	COSTS_N_INSNS (`4`), / HI /
3568	COSTS_N_INSNS (`3`), / SI /
3569	COSTS_N_INSNS (`4`), / DI /
3570	COSTS_N_INSNS (`4`)}, / other /
3571	.mult_bit: `0`, / cost of multiply per each bit set /
3572	.divide: {COSTS_N_INSNS (`16`), / cost of a divide/mod for QI /
3573	COSTS_N_INSNS (`22`), / HI /
3574	COSTS_N_INSNS (`30`), / SI /
3575	COSTS_N_INSNS (`74`), / DI /
3576	COSTS_N_INSNS (`74`)}, / other /
3577	COSTS_N_INSNS (`1`), / cost of movsx /
3578	COSTS_N_INSNS (`1`), / cost of movzx /
3579	.large_insn: `8`, / "large" insn /
3580	.move_ratio: `17`, / MOVE_RATIO /
3581	.clear_ratio: `6`, / CLEAR_RATIO /
3582	.int_load: {`6`, `6`, `6`}, / cost of loading integer registers*
3583	in QImode, HImode and SImode.
3584	Relative to reg-reg move (2). /*
3585	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3586	.sse_load: {`6`, `6`, `6`, `10`, `15`}, / cost of loading SSE register*
3587	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3588	.sse_store: {`6`, `6`, `6`, `10`, `15`}, / cost of storing SSE register*
3589	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3590	.sse_unaligned_load: {`6`, `6`, `6`, `10`, `15`}, / cost of unaligned loads. /
3591	.sse_unaligned_store: {`6`, `6`, `6`, `10`, `15`}, / cost of unaligned storess. /
3592	.xmm_move: `2`, .ymm_move: `3`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
3593	.sse_to_integer: `6`, / cost of moving SSE register to integer. /
3594	.gather_static: `18`, .gather_per_elt: `6`, / Gather load static, per_elt. /
3595	.scatter_static: `18`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
3596	.l1_cache_size: `32`, / size of l1 cache. /
3597	.l2_cache_size: `512`, / size of l2 cache. /
3598	.prefetch_block: `64`, / size of prefetch block /
3599	.simultaneous_prefetches: `6`, / number of parallel prefetches /
3600	/ Benchmarks shows large regressions on K8 sixtrack benchmark when this*
3601	value is increased to perhaps more appropriate value of 5. /*
3602	.branch_cost: `3`, / Branch cost /
3603	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
3604	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
3605	COSTS_N_INSNS (`17`), / cost of FDIV instruction. /
3606	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
3607	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
3608	COSTS_N_INSNS (`14`), / cost of FSQRT instruction. /
3609
3610	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3611	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
3612	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
3613	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
3614	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
3615	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
3616	COSTS_N_INSNS (`13`), / cost of DIVSS instruction. /
3617	COSTS_N_INSNS (`17`), / cost of DIVSD instruction. /
3618	COSTS_N_INSNS (`14`), / cost of SQRTSS instruction. /
3619	COSTS_N_INSNS (`18`), / cost of SQRTSD instruction. /
3620	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `3`, .reassoc_vec_fp: `3`, / reassoc int, fp, vec_int, vec_fp. /
3621	.memcpy: generic_memcpy,
3622	.memset: generic_memset,
3623	COSTS_N_INSNS (`4`), / cond_taken_branch_cost. /
3624	COSTS_N_INSNS (`2`), / cond_not_taken_branch_cost. /
3625	.align_loop: "16:11:8", / Loop alignment. /
3626	.align_jump: "16:11:8", / Jump alignment. /
3627	.align_label: "0:0:8", / Label alignment. /
3628	.align_func: "16", / Func alignment. /
3629	.small_unroll_ninsns: `4`, / Small unroll limit. /
3630	.small_unroll_factor: `2`, / Small unroll factor. /
3631	};
3632
3633	/* core_cost should produce code tuned for Core family of CPUs.  */
/* memcpy strategy table for the Core cost table.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3634	static stringop_algs core_memcpy[`2`] = {
3635	{.unknown_size: libcall, .size: {{`1024`, rep_prefix_4_byte, true}, {-`1`, libcall, false}}},
3636	{.unknown_size: libcall, .size: {{`24`, loop, true}, {`128`, rep_prefix_8_byte, true},
3637	{-`1`, libcall, false}}}};
/* memset strategy table for the Core cost table.
   Each of the two elements gives the algorithm for an unknown size
   (.unknown_size) and a .size list of {bound, algorithm, flag} triples
   whose last triple has bound -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, and -1 means "no upper bound" -- confirm against the
   stringop_algs definition.  */
3638	static stringop_algs core_memset[`2`] = {
3639	{.unknown_size: libcall, .size: {{`6`, loop_1_byte, true},
3640	{`24`, loop, true},
3641	{`8192`, rep_prefix_4_byte, true},
3642	{-`1`, libcall, false}}},
3643	{.unknown_size: libcall, .size: {{`24`, loop, true}, {`512`, rep_prefix_8_byte, true},
3644	{-`1`, libcall, false}}}};
3645
3646	static const
3647	struct processor_costs core_cost = {
3648	.hard_register: {
3649	/ Start of register allocator costs. integer->integer move cost is 2. /
3650	.movzbl_load: `6`, / cost for loading QImode using movzbl /
3651	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
3652	in QImode, HImode and SImode.
3653	Relative to reg-reg move (2). /*
3654	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3655	.fp_move: `2`, / cost of reg,reg fld/fst /
3656	.fp_load: {`6`, `6`, `8`}, / cost of loading fp registers*
3657	in SFmode, DFmode and XFmode /*
3658	.fp_store: {`6`, `6`, `10`}, / cost of storing fp registers*
3659	in SFmode, DFmode and XFmode /*
3660	.mmx_move: `2`, / cost of moving MMX register /
3661	.mmx_load: {`6`, `6`}, / cost of loading MMX registers*
3662	in SImode and DImode /*
3663	.mmx_store: {`6`, `6`}, / cost of storing MMX registers*
3664	in SImode and DImode /*
3665	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
3666	.sse_load: {`6`, `6`, `6`, `6`, `12`}, / cost of loading SSE registers*
3667	in 32,64,128,256 and 512-bit /*
3668	.sse_store: {`6`, `6`, `6`, `6`, `12`}, / cost of storing SSE registers*
3669	in 32,64,128,256 and 512-bit /*
3670	.sse_to_integer: `6`, .integer_to_sse: `6`, / SSE->integer and integer->SSE moves /
3671	.mask_to_integer: `6`, .integer_to_mask: `6`, / mask->integer and integer->mask moves /
3672	.mask_load: {`4`, `4`, `4`}, / cost of loading mask register*
3673	in QImode, HImode, SImode. /*
3674	.mask_store: {`6`, `6`, `6`}, / cost if storing mask register*
3675	in QImode, HImode, SImode. /*
3676	.mask_move: `2`, / cost of moving mask register. /
3677	/ End of register allocator costs. /
3678	},
3679
3680	COSTS_N_INSNS (`1`), / cost of an add instruction /
3681	/ On all chips taken into consideration lea is 2 cycles and more. With*
3682	this cost however our current implementation of synth_mult results in
3683	use of unnecessary temporary registers causing regression on several
3684	SPECfp benchmarks. /*
3685	COSTS_N_INSNS (`1`) + `1`, / cost of a lea instruction /
3686	COSTS_N_INSNS (`1`), / variable shift costs /
3687	COSTS_N_INSNS (`1`), / constant shift costs /
3688	.mult_init: {COSTS_N_INSNS (`3`), / cost of starting multiply for QI /
3689	COSTS_N_INSNS (`4`), / HI /
3690	COSTS_N_INSNS (`3`), / SI /
3691	/ Here we tune for Sandybridge or newer. /
3692	COSTS_N_INSNS (`3`), / DI /
3693	COSTS_N_INSNS (`3`)}, / other /
3694	.mult_bit: `0`, / cost of multiply per each bit set /
3695	/ Expanding div/mod currently doesn't consider parallelism. So the cost*
3696	model is not realistic. We compensate by increasing the latencies a bit. /*
3697	.divide: {COSTS_N_INSNS (`11`), / cost of a divide/mod for QI /
3698	COSTS_N_INSNS (`11`), / HI /
3699	COSTS_N_INSNS (`14`), / SI /
3700	COSTS_N_INSNS (`81`), / DI /
3701	COSTS_N_INSNS (`81`)}, / other /
3702	COSTS_N_INSNS (`1`), / cost of movsx /
3703	COSTS_N_INSNS (`1`), / cost of movzx /
3704	.large_insn: `8`, / "large" insn /
3705	.move_ratio: `17`, / MOVE_RATIO /
3706	.clear_ratio: `6`, / CLEAR_RATIO /
3707	.int_load: {`4`, `4`, `4`}, / cost of loading integer registers*
3708	in QImode, HImode and SImode.
3709	Relative to reg-reg move (2). /*
3710	.int_store: {`6`, `6`, `6`}, / cost of storing integer registers /
3711	.sse_load: {`6`, `6`, `6`, `6`, `12`}, / cost of loading SSE register*
3712	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3713	.sse_store: {`6`, `6`, `6`, `6`, `12`}, / cost of storing SSE register*
3714	in 32bit, 64bit, 128bit, 256bit and 512bit /*
3715	.sse_unaligned_load: {`6`, `6`, `6`, `6`, `12`}, / cost of unaligned loads. /
3716	.sse_unaligned_store: {`6`, `6`, `6`, `6`, `12`}, / cost of unaligned stores. /
3717	.xmm_move: `2`, .ymm_move: `2`, .zmm_move: `4`, / cost of moving XMM,YMM,ZMM register /
3718	.sse_to_integer: `2`, / cost of moving SSE register to integer. /
3719	/ VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,*
3720	rec. throughput 6.
3721	So 5 uops statically and one uops per load. /*
3722	.gather_static: `10`, .gather_per_elt: `6`, / Gather load static, per_elt. /
3723	.scatter_static: `10`, .scatter_per_elt: `6`, / Gather store static, per_elt. /
3724	.l1_cache_size: `64`, / size of l1 cache. /
3725	.l2_cache_size: `512`, / size of l2 cache. /
3726	.prefetch_block: `64`, / size of prefetch block /
3727	.simultaneous_prefetches: `6`, / number of parallel prefetches /
3728	/ FIXME perhaps more appropriate value is 5. /
3729	.branch_cost: `3`, / Branch cost /
3730	COSTS_N_INSNS (`3`), / cost of FADD and FSUB insns. /
3731	COSTS_N_INSNS (`5`), / cost of FMUL instruction. /
3732	/ 10-24 /
3733	COSTS_N_INSNS (`24`), / cost of FDIV instruction. /
3734	COSTS_N_INSNS (`1`), / cost of FABS instruction. /
3735	COSTS_N_INSNS (`1`), / cost of FCHS instruction. /
3736	COSTS_N_INSNS (`23`), / cost of FSQRT instruction. /
3737
3738	COSTS_N_INSNS (`1`), / cost of cheap SSE instruction. /
3739	COSTS_N_INSNS (`3`), / cost of ADDSS/SD SUBSS/SD insns. /
3740	COSTS_N_INSNS (`4`), / cost of MULSS instruction. /
3741	COSTS_N_INSNS (`5`), / cost of MULSD instruction. /
3742	COSTS_N_INSNS (`5`), / cost of FMA SS instruction. /
3743	COSTS_N_INSNS (`5`), / cost of FMA SD instruction. /
3744	COSTS_N_INSNS (`18`), / cost of DIVSS instruction. /
3745	COSTS_N_INSNS (`32`), / cost of DIVSD instruction. /
3746	COSTS_N_INSNS (`30`), / cost of SQRTSS instruction. /
3747	COSTS_N_INSNS (`58`), / cost of SQRTSD instruction. /
3748	.reassoc_int: `1`, .reassoc_fp: `4`, .reassoc_vec_int: `2`, .reassoc_vec_fp: `2`, / reassoc int, fp, vec_int, vec_fp. /
3749	.memcpy: core_memcpy,
3750	.memset: core_memset,
3751	COSTS_N_INSNS (`3`), / cond_taken_branch_cost. /
3752	COSTS_N_INSNS (`1`), / cond_not_taken_branch_cost. /
3753	.align_loop: "16:11:8", / Loop alignment. /
3754	.align_jump: "16:11:8", / Jump alignment. /
3755	.align_label: "0:0:8", / Label alignment. /
3756	.align_func: "16", / Func alignment. /
3757	.small_unroll_ninsns: `4`, / Small unroll limit. /
3758	.small_unroll_factor: `2`, / Small unroll factor. /
3759	};
3760
3761

source code of gcc/config/i386/x86-tune-costs.h