R600ISelLowering.cpp source code [llvm/lib/Target/AMDGPU/R600ISelLowering.cpp]

1	//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Custom DAG lowering for R600
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "R600ISelLowering.h"
15	#include "AMDGPU.h"
16	#include "MCTargetDesc/R600MCTargetDesc.h"
17	#include "R600Defines.h"
18	#include "R600InstrInfo.h"
19	#include "R600MachineFunctionInfo.h"
20	#include "R600Subtarget.h"
21	#include "R600TargetMachine.h"
22	#include "llvm/CodeGen/MachineFunction.h"
23	#include "llvm/IR/IntrinsicsAMDGPU.h"
24	#include "llvm/IR/IntrinsicsR600.h"
25
26	using namespace llvm;
27
28	#include "R600GenCallingConv.inc"
29
30	R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
31	const R600Subtarget &STI)
32	: AMDGPUTargetLowering (TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
33	addRegisterClass(MVT::VT: f32, RC: &R600::R600_Reg32RegClass);
34	addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
35	addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
36	addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
37	addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
38	addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);
39
40	setBooleanContents(ZeroOrNegativeOneBooleanContent);
41	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
42
43	computeRegisterProperties(Subtarget->getRegisterInfo());
44
45	// Legalize loads and stores to the private address space.
46	setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom);
47
48	// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
49	// spaces, so it is custom lowered to handle those where it isn't.
50	for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD})
51	for (MVT VT : MVT::integer_valuetypes()) {
52	setLoadExtAction(Op, VT, MVT::i1, Promote);
53	setLoadExtAction(Op, VT, MVT::i8, Custom);
54	setLoadExtAction(Op, VT, MVT::i16, Custom);
55	}
56
57	// Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
58	setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
59	MVT::v2i1, Expand);
60
61	setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v4i32,
62	MVT::v4i1, Expand);
63
64	setOperationAction(ISD::STORE, {MVT::i8, MVT::i32, MVT::v2i32, MVT::v4i32},
65	Custom);
66
67	setTruncStoreAction(MVT::i32, MVT::i8, Custom);
68	setTruncStoreAction(MVT::i32, MVT::i16, Custom);
69	// We need to include these since trunc STORES to PRIVATE need
70	// special handling to accommodate RMW
71	setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
72	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
73	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
74	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
75	setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
76	setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
77	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
78	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
79	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
80	setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
81
82	// Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
83	setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
84	setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
85
86	// Set condition code actions
87	setCondCodeAction({ISD::SETO, ISD::SETUO, ISD::SETLT, ISD::SETLE, ISD::SETOLT,
88	ISD::SETOLE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGE,
89	ISD::SETUGT, ISD::SETULT, ISD::SETULE},
90	MVT::f32, Expand);
91
92	setCondCodeAction({ISD::SETLE, ISD::SETLT, ISD::SETULE, ISD::SETULT},
93	MVT::i32, Expand);
94
95	setOperationAction({ISD::FCOS, ISD::FSIN}, MVT::f32, Custom);
96
97	setOperationAction(ISD::SETCC, {MVT::v4i32, MVT::v2i32}, Expand);
98
99	setOperationAction(ISD::BR_CC, {MVT::i32, MVT::f32}, Expand);
100	setOperationAction(ISD::BRCOND, MVT::Other, Custom);
101
102	setOperationAction(ISD::FSUB, MVT::f32, Expand);
103
104	setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
105	MVT::f64, Custom);
106
107	setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom);
108
109	setOperationAction(ISD::SETCC, {MVT::i32, MVT::f32}, Expand);
110	setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT}, {MVT::i1, MVT::i64},
111	Custom);
112
113	setOperationAction(ISD::SELECT, {MVT::i32, MVT::f32, MVT::v2i32, MVT::v4i32},
114	Expand);
115
116	// ADD, SUB overflow.
117	// TODO: turn these into Legal?
118	if (Subtarget->hasCARRY())
119	setOperationAction(ISD::UADDO, MVT::i32, Custom);
120
121	if (Subtarget->hasBORROW())
122	setOperationAction(ISD::USUBO, MVT::i32, Custom);
123
124	// Expand sign extension of vectors
125	if (!Subtarget->hasBFE())
126	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
127
128	setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i1, MVT::v4i1}, Expand);
129
130	if (!Subtarget->hasBFE())
131	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
132	setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i8, MVT::v4i8}, Expand);
133
134	if (!Subtarget->hasBFE())
135	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
136	setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v4i16}, Expand);
137
138	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
139	setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i32, MVT::v4i32}, Expand);
140
141	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
142
143	setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
144
145	setOperationAction(ISD::EXTRACT_VECTOR_ELT,
146	{MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom);
147
148	setOperationAction(ISD::INSERT_VECTOR_ELT,
149	{MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom);
150
151	// We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
152	// to be Legal/Custom in order to avoid library calls.
153	setOperationAction({ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS}, MVT::i32,
154	Custom);
155
156	if (!Subtarget->hasFMA())
157	setOperationAction(ISD::FMA, {MVT::f32, MVT::f64}, Expand);
158
159	// FIXME: May need no denormals check
160	setOperationAction(ISD::FMAD, MVT::f32, Legal);
161
162	if (!Subtarget->hasBFI())
163	// fcopysign can be done in a single instruction with BFI.
164	setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
165
166	if (!Subtarget->hasBCNT(`32`))
167	setOperationAction(ISD::CTPOP, MVT::i32, Expand);
168
169	if (!Subtarget->hasBCNT(`64`))
170	setOperationAction(ISD::CTPOP, MVT::i64, Expand);
171
172	if (Subtarget->hasFFBH())
173	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
174
175	if (Subtarget->hasFFBL())
176	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
177
178	// FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
179	// need it for R600.
180	if (Subtarget->hasBFE())
181	setHasExtractBitsInsn(true);
182
183	setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
184	setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
185
186	const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
187	for (MVT VT : ScalarIntVTs)
188	setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT,
189	Expand);
190
191	// LLVM will expand these to atomic_cmp_swap(0)
192	// and atomic_swap, respectively.
193	setOperationAction({ISD::ATOMIC_LOAD, ISD::ATOMIC_STORE}, MVT::i32, Expand);
194
195	// We need to custom lower some of the intrinsics
196	setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN}, MVT::Other,
197	Custom);
198
199	setSchedulingPreference(Sched::Source);
200
201	setTargetDAGCombine({ISD::FP_ROUND, ISD::FP_TO_SINT, ISD::EXTRACT_VECTOR_ELT,
202	ISD::SELECT_CC, ISD::INSERT_VECTOR_ELT, ISD::LOAD});
203	}
204
205	static inline bool isEOP(MachineBasicBlock::iterator I) {
206	if (std::next(x: I) == I ->getParent()->end())
207	return false;
208	return std::next(x: I)->getOpcode() == R600::RETURN;
209	}
210
211	MachineBasicBlock *
212	R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
213	MachineBasicBlock BB) const* {
214	MachineFunction *MF = BB->getParent();
215	MachineRegisterInfo &MRI = MF->getRegInfo();
216	MachineBasicBlock::iterator I = MI;
217	const R600InstrInfo *TII = Subtarget->getInstrInfo();
218
219	switch (MI.getOpcode()) {
220	default:
221	// Replace LDS__RET instruction that don't have any uses with the*
222	// equivalent LDS__NORET instruction.*
223	if (TII->isLDSRetInstr(MI.getOpcode())) {
224	int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
225	assert(DstIdx != -`1`);
226	MachineInstrBuilder NewMI;
227	// FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
228	// LDS_1A2D support and remove this special case.
229	if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) \|\|
230	MI.getOpcode() == R600::LDS_CMPST_RET)
231	return BB;
232
233	NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
234	TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
235	for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
236	NewMI.add(MO);
237	} else {
238	return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
239	}
240	break;
241
242	case R600::FABS_R600: {
243	MachineInstr *NewMI = TII->buildDefaultInstruction(
244	*BB, I, R600::MOV, MI.getOperand(`0`).getReg(),
245	MI.getOperand(`1`).getReg());
246	TII->addFlag(*NewMI, `0`, MO_FLAG_ABS);
247	break;
248	}
249
250	case R600::FNEG_R600: {
251	MachineInstr *NewMI = TII->buildDefaultInstruction(
252	*BB, I, R600::MOV, MI.getOperand(`0`).getReg(),
253	MI.getOperand(`1`).getReg());
254	TII->addFlag(*NewMI, `0`, MO_FLAG_NEG);
255	break;
256	}
257
258	case R600::MASK_WRITE: {
259	Register maskedRegister = MI.getOperand(`0`).getReg();
260	assert(maskedRegister.isVirtual());
261	MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
262	TII->addFlag(*defInstr, `0`, MO_FLAG_MASK);
263	break;
264	}
265
266	case R600::MOV_IMM_F32:
267	TII->buildMovImm(*BB, I, MI.getOperand(`0`).getReg(), MI.getOperand(`1`)
268	.getFPImm()
269	->getValueAPF()
270	.bitcastToAPInt()
271	.getZExtValue());
272	break;
273
274	case R600::MOV_IMM_I32:
275	TII->buildMovImm(*BB, I, MI.getOperand(`0`).getReg(),
276	MI.getOperand(`1`).getImm());
277	break;
278
279	case R600::MOV_IMM_GLOBAL_ADDR: {
280	//TODO: Perhaps combine this instruction with the next if possible
281	auto MIB = TII->buildDefaultInstruction(
282	*BB, MI, R600::MOV, MI.getOperand(`0`).getReg(), R600::ALU_LITERAL_X);
283	int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
284	//TODO: Ugh this is rather ugly
285	const MachineOperand &MO = MI.getOperand(`1`);
286	MIB->getOperand(Idx).ChangeToGA(MO.getGlobal(), MO.getOffset(),
287	MO.getTargetFlags());
288	break;
289	}
290
291	case R600::CONST_COPY: {
292	MachineInstr *NewMI = TII->buildDefaultInstruction(
293	*BB, MI, R600::MOV, MI.getOperand(`0`).getReg(), R600::ALU_CONST);
294	TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
295	MI.getOperand(`1`).getImm());
296	break;
297	}
298
299	case R600::RAT_WRITE_CACHELESS_32_eg:
300	case R600::RAT_WRITE_CACHELESS_64_eg:
301	case R600::RAT_WRITE_CACHELESS_128_eg:
302	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
303	.add(MI.getOperand(`0`))
304	.add(MI.getOperand(`1`))
305	.addImm(isEOP(I)); // Set End of program bit
306	break;
307
308	case R600::RAT_STORE_TYPED_eg:
309	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
310	.add(MI.getOperand(`0`))
311	.add(MI.getOperand(`1`))
312	.add(MI.getOperand(`2`))
313	.addImm(isEOP(I)); // Set End of program bit
314	break;
315
316	case R600::BRANCH:
317	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
318	.add(MI.getOperand(`0`));
319	break;
320
321	case R600::BRANCH_COND_f32: {
322	MachineInstr *NewMI =
323	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
324	R600::PREDICATE_BIT)
325	.add(MI.getOperand(`1`))
326	.addImm(R600::PRED_SETNE)
327	.addImm(`0`); // Flags
328	TII->addFlag(*NewMI, `0`, MO_FLAG_PUSH);
329	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
330	.add(MI.getOperand(`0`))
331	.addReg(R600::PREDICATE_BIT, RegState::Kill);
332	break;
333	}
334
335	case R600::BRANCH_COND_i32: {
336	MachineInstr *NewMI =
337	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
338	R600::PREDICATE_BIT)
339	.add(MI.getOperand(`1`))
340	.addImm(R600::PRED_SETNE_INT)
341	.addImm(`0`); // Flags
342	TII->addFlag(*NewMI, `0`, MO_FLAG_PUSH);
343	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
344	.add(MI.getOperand(`0`))
345	.addReg(R600::PREDICATE_BIT, RegState::Kill);
346	break;
347	}
348
349	case R600::EG_ExportSwz:
350	case R600::R600_ExportSwz: {
351	// Instruction is left unmodified if its not the last one of its type
352	bool isLastInstructionOfItsType = true;
353	unsigned InstExportType = MI.getOperand(`1`).getImm();
354	for (MachineBasicBlock::iterator NextExportInst = std::next(I),
355	EndBlock = BB->end(); NextExportInst != EndBlock;
356	NextExportInst = std::next(NextExportInst)) {
357	if (NextExportInst->getOpcode() == R600::EG_ExportSwz \|\|
358	NextExportInst->getOpcode() == R600::R600_ExportSwz) {
359	unsigned CurrentInstExportType = NextExportInst->getOperand(`1`)
360	.getImm();
361	if (CurrentInstExportType == InstExportType) {
362	isLastInstructionOfItsType = false;
363	break;
364	}
365	}
366	}
367	bool EOP = isEOP(I);
368	if (!EOP && !isLastInstructionOfItsType)
369	return BB;
370	unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? `84` : `40`;
371	BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
372	.add(MI.getOperand(`0`))
373	.add(MI.getOperand(`1`))
374	.add(MI.getOperand(`2`))
375	.add(MI.getOperand(`3`))
376	.add(MI.getOperand(`4`))
377	.add(MI.getOperand(`5`))
378	.add(MI.getOperand(`6`))
379	.addImm(CfInst)
380	.addImm(EOP);
381	break;
382	}
383	case R600::RETURN: {
384	return BB;
385	}
386	}
387
388	MI.eraseFromParent();
389	return BB;
390	}
391
392	//===----------------------------------------------------------------------===//
393	// Custom DAG Lowering Operations
394	//===----------------------------------------------------------------------===//
395
396	SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
397	MachineFunction &MF = DAG.getMachineFunction();
398	R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
399	switch (Op.getOpcode()) {
400	default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
401	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
402	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
403	case ISD::SHL_PARTS:
404	case ISD::SRA_PARTS:
405	case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
406	case ISD::UADDO: return LowerUADDSUBO(Op, DAG, mainop: ISD::ADD, ovf: AMDGPUISD::CARRY);
407	case ISD::USUBO: return LowerUADDSUBO(Op, DAG, mainop: ISD::SUB, ovf: AMDGPUISD::BORROW);
408	case ISD::FCOS:
409	case ISD::FSIN: return LowerTrig(Op, DAG);
410	case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
411	case ISD::STORE: return LowerSTORE(Op, DAG);
412	case ISD::LOAD: {
413	SDValue Result = LowerLOAD(Op, DAG);
414	assert((!Result.getNode() \|\|
415	Result.getNode()->getNumValues() == `2`) &&
416	"Load should return a value and a chain");
417	return Result;
418	}
419
420	case ISD::BRCOND: return LowerBRCOND(Op, DAG);
421	case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
422	case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
423	case ISD::ADDRSPACECAST:
424	return lowerADDRSPACECAST(Op, DAG);
425	case ISD::INTRINSIC_VOID: {
426	SDValue Chain = Op.getOperand(i: `0`);
427	unsigned IntrinsicID = Op.getConstantOperandVal(i: `1`);
428	switch (IntrinsicID) {
429	case Intrinsic::r600_store_swizzle: {
430	SDLoc DL(Op);
431	const SDValue Args[`8`] = {
432	Chain,
433	Op.getOperand(`2`), // Export Value
434	Op.getOperand(`3`), // ArrayBase
435	Op.getOperand(`4`), // Type
436	DAG.getConstant(`0`, DL, MVT::i32), // SWZ_X
437	DAG.getConstant(`1`, DL, MVT::i32), // SWZ_Y
438	DAG.getConstant(`2`, DL, MVT::i32), // SWZ_Z
439	DAG.getConstant(`3`, DL, MVT::i32) // SWZ_W
440	};
441	return DAG.getNode(Opcode: AMDGPUISD::R600_EXPORT, DL, VT: Op.getValueType(), Ops: Args);
442	}
443
444	// default for switch(IntrinsicID)
445	default: break;
446	}
447	// break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
448	break;
449	}
450	case ISD::INTRINSIC_WO_CHAIN: {
451	unsigned IntrinsicID = Op.getConstantOperandVal(i: `0`);
452	EVT VT = Op.getValueType();
453	SDLoc DL(Op);
454	switch (IntrinsicID) {
455	case Intrinsic::r600_tex:
456	case Intrinsic::r600_texc: {
457	unsigned TextureOp;
458	switch (IntrinsicID) {
459	case Intrinsic::r600_tex:
460	TextureOp = `0`;
461	break;
462	case Intrinsic::r600_texc:
463	TextureOp = `1`;
464	break;
465	default:
466	llvm_unreachable("unhandled texture operation");
467	}
468
469	SDValue TexArgs[`19`] = {
470	DAG.getConstant(TextureOp, DL, MVT::i32),
471	Op.getOperand(`1`),
472	DAG.getConstant(`0`, DL, MVT::i32),
473	DAG.getConstant(`1`, DL, MVT::i32),
474	DAG.getConstant(`2`, DL, MVT::i32),
475	DAG.getConstant(`3`, DL, MVT::i32),
476	Op.getOperand(`2`),
477	Op.getOperand(`3`),
478	Op.getOperand(`4`),
479	DAG.getConstant(`0`, DL, MVT::i32),
480	DAG.getConstant(`1`, DL, MVT::i32),
481	DAG.getConstant(`2`, DL, MVT::i32),
482	DAG.getConstant(`3`, DL, MVT::i32),
483	Op.getOperand(`5`),
484	Op.getOperand(`6`),
485	Op.getOperand(`7`),
486	Op.getOperand(`8`),
487	Op.getOperand(`9`),
488	Op.getOperand(`10`)
489	};
490	return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
491	}
492	case Intrinsic::r600_dot4: {
493	SDValue Args[`8`] = {
494	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`1`),
495	DAG.getConstant(`0`, DL, MVT::i32)),
496	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`2`),
497	DAG.getConstant(`0`, DL, MVT::i32)),
498	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`1`),
499	DAG.getConstant(`1`, DL, MVT::i32)),
500	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`2`),
501	DAG.getConstant(`1`, DL, MVT::i32)),
502	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`1`),
503	DAG.getConstant(`2`, DL, MVT::i32)),
504	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`2`),
505	DAG.getConstant(`2`, DL, MVT::i32)),
506	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`1`),
507	DAG.getConstant(`3`, DL, MVT::i32)),
508	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(`2`),
509	DAG.getConstant(`3`, DL, MVT::i32))
510	};
511	return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
512	}
513
514	case Intrinsic::r600_implicitarg_ptr: {
515	MVT PtrVT = getPointerTy(DL: DAG.getDataLayout(), AS: AMDGPUAS::PARAM_I_ADDRESS);
516	uint32_t ByteOffset = getImplicitParameterOffset(MF, Param: FIRST_IMPLICIT);
517	return DAG.getConstant(Val: ByteOffset, DL, VT: PtrVT);
518	}
519	case Intrinsic::r600_read_ngroups_x:
520	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `0`);
521	case Intrinsic::r600_read_ngroups_y:
522	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `1`);
523	case Intrinsic::r600_read_ngroups_z:
524	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `2`);
525	case Intrinsic::r600_read_global_size_x:
526	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `3`);
527	case Intrinsic::r600_read_global_size_y:
528	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `4`);
529	case Intrinsic::r600_read_global_size_z:
530	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `5`);
531	case Intrinsic::r600_read_local_size_x:
532	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `6`);
533	case Intrinsic::r600_read_local_size_y:
534	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `7`);
535	case Intrinsic::r600_read_local_size_z:
536	return LowerImplicitParameter(DAG, VT, DL, DwordOffset: `8`);
537
538	case Intrinsic::r600_read_tgid_x:
539	case Intrinsic::amdgcn_workgroup_id_x:
540	return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
541	R600::T1_X, VT);
542	case Intrinsic::r600_read_tgid_y:
543	case Intrinsic::amdgcn_workgroup_id_y:
544	return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
545	R600::T1_Y, VT);
546	case Intrinsic::r600_read_tgid_z:
547	case Intrinsic::amdgcn_workgroup_id_z:
548	return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
549	R600::T1_Z, VT);
550	case Intrinsic::r600_read_tidig_x:
551	case Intrinsic::amdgcn_workitem_id_x:
552	return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
553	R600::T0_X, VT);
554	case Intrinsic::r600_read_tidig_y:
555	case Intrinsic::amdgcn_workitem_id_y:
556	return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
557	R600::T0_Y, VT);
558	case Intrinsic::r600_read_tidig_z:
559	case Intrinsic::amdgcn_workitem_id_z:
560	return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
561	R600::T0_Z, VT);
562
563	case Intrinsic::r600_recipsqrt_ieee:
564	return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: `1`));
565
566	case Intrinsic::r600_recipsqrt_clamped:
567	return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: `1`));
568	default:
569	return Op;
570	}
571
572	// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
573	break;
574	}
575	} // end switch(Op.getOpcode())
576	return SDValue ();
577	}
578
579	void R600TargetLowering::ReplaceNodeResults(SDNode *N,
580	SmallVectorImpl<SDValue> &Results,
581	SelectionDAG &DAG) const {
582	switch (N->getOpcode()) {
583	default:
584	AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
585	return;
586	case ISD::FP_TO_UINT:
587	if (N->getValueType(`0`) == MVT::i1) {
588	Results.push_back(Elt: lowerFP_TO_UINT(Op: N->getOperand(Num: `0`), DAG));
589	return;
590	}
591	// Since we don't care about out of bounds values we can use FP_TO_SINT for
592	// uints too. The DAGLegalizer code for uint considers some extra cases
593	// which are not necessary here.
594	[[fallthrough]];
595	case ISD::FP_TO_SINT: {
596	if (N->getValueType(`0`) == MVT::i1) {
597	Results.push_back(Elt: lowerFP_TO_SINT(Op: N->getOperand(Num: `0`), DAG));
598	return;
599	}
600
601	SDValue Result;
602	if (expandFP_TO_SINT(N, Result, DAG))
603	Results.push_back(Elt: Result);
604	return;
605	}
606	case ISD::SDIVREM: {
607	SDValue Op = SDValue (N, `1`);
608	SDValue RES = LowerSDIVREM(Op, DAG);
609	Results.push_back(Elt: RES);
610	Results.push_back(Elt: RES.getValue(R: `1`));
611	break;
612	}
613	case ISD::UDIVREM: {
614	SDValue Op = SDValue (N, `0`);
615	LowerUDIVREM64(Op, DAG, Results);
616	break;
617	}
618	}
619	}
620
621	SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
622	SDValue Vector) const {
623	SDLoc DL(Vector);
624	EVT VecVT = Vector.getValueType();
625	EVT EltVT = VecVT.getVectorElementType();
626	SmallVector<SDValue, `8`> Args;
627
628	for (unsigned i = `0`, e = VecVT.getVectorNumElements(); i != e; ++i) {
629	Args.push_back(Elt: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Vector,
630	N2: DAG.getVectorIdxConstant(Val: i, DL)));
631	}
632
633	return DAG.getNode(Opcode: AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VT: VecVT, Ops: Args);
634	}
635
636	SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
637	SelectionDAG &DAG) const {
638	SDLoc DL(Op);
639	SDValue Vector = Op.getOperand(i: `0`);
640	SDValue Index = Op.getOperand(i: `1`);
641
642	if (isa<ConstantSDNode>(Val: Index) \|\|
643	Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
644	return Op;
645
646	Vector = vectorToVerticalVector(DAG, Vector);
647	return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: Op.getValueType(),
648	N1: Vector, N2: Index);
649	}
650
651	SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
652	SelectionDAG &DAG) const {
653	SDLoc DL(Op);
654	SDValue Vector = Op.getOperand(i: `0`);
655	SDValue Value = Op.getOperand(i: `1`);
656	SDValue Index = Op.getOperand(i: `2`);
657
658	if (isa<ConstantSDNode>(Val: Index) \|\|
659	Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
660	return Op;
661
662	Vector = vectorToVerticalVector(DAG, Vector);
663	SDValue Insert = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: Op.getValueType(),
664	N1: Vector, N2: Value, N3: Index);
665	return vectorToVerticalVector(DAG, Vector: Insert);
666	}
667
668	SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
669	SDValue Op,
670	SelectionDAG &DAG) const {
671	GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
672	if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
673	return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
674
675	const DataLayout &DL = DAG.getDataLayout();
676	const GlobalValue *GV = GSD->getGlobal();
677	MVT ConstPtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
678
679	SDValue GA = DAG.getTargetGlobalAddress(GV, DL: SDLoc (GSD), VT: ConstPtrVT);
680	return DAG.getNode(Opcode: AMDGPUISD::CONST_DATA_PTR, DL: SDLoc (GSD), VT: ConstPtrVT, Operand: GA);
681	}
682
683	SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
684	// On hw >= R700, COS/SIN input must be between -1. and 1.
685	// Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
686	EVT VT = Op.getValueType();
687	SDValue Arg = Op.getOperand(i: `0`);
688	SDLoc DL(Op);
689
690	// TODO: Should this propagate fast-math-flags?
691	SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
692	DAG.getNode(ISD::FADD, DL, VT,
693	DAG.getNode(ISD::FMUL, DL, VT, Arg,
694	DAG.getConstantFP(`0.15915494309`, DL, MVT::f32)),
695	DAG.getConstantFP(`0.5`, DL, MVT::f32)));
696	unsigned TrigNode;
697	switch (Op.getOpcode()) {
698	case ISD::FCOS:
699	TrigNode = AMDGPUISD::COS_HW;
700	break;
701	case ISD::FSIN:
702	TrigNode = AMDGPUISD::SIN_HW;
703	break;
704	default:
705	llvm_unreachable("Wrong trig opcode");
706	}
707	SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
708	DAG.getNode(ISD::FADD, DL, VT, FractPart,
709	DAG.getConstantFP(-`0.5`, DL, MVT::f32)));
710	if (Gen >= AMDGPUSubtarget::R700)
711	return TrigVal;
712	// On R600 hw, COS/SIN input must be between -Pi and Pi.
713	return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
714	DAG.getConstantFP(numbers::pif, DL, MVT::f32));
715	}
716
717	SDValue R600TargetLowering::LowerShiftParts(SDValue Op,
718	SelectionDAG &DAG) const {
719	SDValue Lo, Hi;
720	expandShiftParts(N: Op.getNode(), Lo, Hi, DAG);
721	return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc (Op));
722	}
723
724	SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
725	unsigned mainop, unsigned ovf) const {
726	SDLoc DL(Op);
727	EVT VT = Op.getValueType();
728
729	SDValue Lo = Op.getOperand(i: `0`);
730	SDValue Hi = Op.getOperand(i: `1`);
731
732	SDValue OVF = DAG.getNode(Opcode: ovf, DL, VT, N1: Lo, N2: Hi);
733	// Extend sign.
734	OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
735	DAG.getValueType(MVT::i1));
736
737	SDValue Res = DAG.getNode(Opcode: mainop, DL, VT, N1: Lo, N2: Hi);
738
739	return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: Res, N2: OVF);
740	}
741
742	SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
743	SDLoc DL(Op);
744	return DAG.getNode(
745	ISD::SETCC,
746	DL,
747	MVT::i1,
748	Op, DAG.getConstantFP(`1.0f`, DL, MVT::f32),
749	DAG.getCondCode(ISD::SETEQ));
750	}
751
752	SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
753	SDLoc DL(Op);
754	return DAG.getNode(
755	ISD::SETCC,
756	DL,
757	MVT::i1,
758	Op, DAG.getConstantFP(-`1.0f`, DL, MVT::f32),
759	DAG.getCondCode(ISD::SETEQ));
760	}
761
762	SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
763	const SDLoc &DL,
764	unsigned DwordOffset) const {
765	unsigned ByteOffset = DwordOffset * `4`;
766	PointerType * PtrType = PointerType::get(ElementType: VT.getTypeForEVT(Context&: *DAG.getContext()),
767	AddressSpace: AMDGPUAS::PARAM_I_ADDRESS);
768
769	// We shouldn't be using an offset wider than 16-bits for implicit parameters.
770	assert(isInt<`16`>(ByteOffset));
771
772	return DAG.getLoad(VT, DL, DAG.getEntryNode(),
773	DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
774	MachinePointerInfo(ConstantPointerNull::get(PtrType)));
775	}
776
777	bool R600TargetLowering::isZero(SDValue Op) const {
778	if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Val&: Op)) {
779	return Cst->isZero();
780	} else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Val&: Op)){
781	return CstFP->isZero();
782	} else {
783	return false;
784	}
785	}
786
787	bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
788	if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
789	return CFP->isExactlyValue(V: `1.0`);
790	}
791	return isAllOnesConstant(V: Op);
792	}
793
794	bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
795	if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
796	return CFP->getValueAPF().isZero();
797	}
798	return isNullConstant(V: Op);
799	}
800
801	SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
802	SDLoc DL(Op);
803	EVT VT = Op.getValueType();
804
805	SDValue LHS = Op.getOperand(i: `0`);
806	SDValue RHS = Op.getOperand(i: `1`);
807	SDValue True = Op.getOperand(i: `2`);
808	SDValue False = Op.getOperand(i: `3`);
809	SDValue CC = Op.getOperand(i: `4`);
810	SDValue Temp;
811
812	if (VT == MVT::f32) {
813	DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
814	SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
815	if (MinMax)
816	return MinMax;
817	}
818
819	// LHS and RHS are guaranteed to be the same value type
820	EVT CompareVT = LHS.getValueType();
821
822	// Check if we can lower this to a native operation.
823
824	// Try to lower to a SET* instruction:
825	//
826	// SET* can match the following patterns:
827	//
828	// select_cc f32, f32, -1, 0, cc_supported
829	// select_cc f32, f32, 1.0f, 0.0f, cc_supported
830	// select_cc i32, i32, -1, 0, cc_supported
831	//
832
833	// Move hardware True/False values to the correct operand.
834	if (isHWTrueValue(Op: False) && isHWFalseValue(Op: True)) {
835	ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
836	ISD::CondCode InverseCC = ISD::getSetCCInverse(Operation: CCOpcode, Type: CompareVT);
837	if (isCondCodeLegal(CC: InverseCC, VT: CompareVT.getSimpleVT())) {
838	std::swap(a&: False, b&: True);
839	CC = DAG.getCondCode(Cond: InverseCC);
840	} else {
841	ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(Operation: InverseCC);
842	if (isCondCodeLegal(CC: SwapInvCC, VT: CompareVT.getSimpleVT())) {
843	std::swap(a&: False, b&: True);
844	std::swap(a&: LHS, b&: RHS);
845	CC = DAG.getCondCode(Cond: SwapInvCC);
846	}
847	}
848	}
849
850	if (isHWTrueValue(True) && isHWFalseValue(False) &&
851	(CompareVT == VT \|\| VT == MVT::i32)) {
852	// This can be matched by a SET* instruction.
853	return DAG.getNode(Opcode: ISD::SELECT_CC, DL, VT, N1: LHS, N2: RHS, N3: True, N4: False, N5: CC);
854	}
855
856	// Try to lower to a CND* instruction:
857	//
858	// CND* can match the following patterns:
859	//
860	// select_cc f32, 0.0, f32, f32, cc_supported
861	// select_cc f32, 0.0, i32, i32, cc_supported
862	// select_cc i32, 0, f32, f32, cc_supported
863	// select_cc i32, 0, i32, i32, cc_supported
864	//
865
866	// Try to move the zero value to the RHS
867	if (isZero(Op: LHS)) {
868	ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
869	// Try swapping the operands
870	ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(Operation: CCOpcode);
871	if (isCondCodeLegal(CC: CCSwapped, VT: CompareVT.getSimpleVT())) {
872	std::swap(a&: LHS, b&: RHS);
873	CC = DAG.getCondCode(Cond: CCSwapped);
874	} else {
875	// Try inverting the condition and then swapping the operands
876	ISD::CondCode CCInv = ISD::getSetCCInverse(Operation: CCOpcode, Type: CompareVT);
877	CCSwapped = ISD::getSetCCSwappedOperands(Operation: CCInv);
878	if (isCondCodeLegal(CC: CCSwapped, VT: CompareVT.getSimpleVT())) {
879	std::swap(a&: True, b&: False);
880	std::swap(a&: LHS, b&: RHS);
881	CC = DAG.getCondCode(Cond: CCSwapped);
882	}
883	}
884	}
885	if (isZero(Op: RHS)) {
886	SDValue Cond = LHS;
887	SDValue Zero = RHS;
888	ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
889	if (CompareVT != VT) {
890	// Bitcast True / False to the correct types. This will end up being
891	// a nop, but it allows us to define only a single pattern in the
892	// .TD files for each CND* instruction rather than having to have
893	// one pattern for integer True/False and one for fp True/False
894	True = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CompareVT, Operand: True);
895	False = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CompareVT, Operand: False);
896	}
897
898	switch (CCOpcode) {
899	case ISD::SETONE:
900	case ISD::SETUNE:
901	case ISD::SETNE:
902	CCOpcode = ISD::getSetCCInverse(Operation: CCOpcode, Type: CompareVT);
903	Temp = True;
904	True = False;
905	False = Temp;
906	break;
907	default:
908	break;
909	}
910	SDValue SelectNode = DAG.getNode(Opcode: ISD::SELECT_CC, DL, VT: CompareVT,
911	N1: Cond, N2: Zero,
912	N3: True, N4: False,
913	N5: DAG.getCondCode(Cond: CCOpcode));
914	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SelectNode);
915	}
916
917	// If we make it this far, it means we have no native instructions to handle
918	// this SELECT_CC, so we must lower it.
919	SDValue HWTrue, HWFalse;
920
921	if (CompareVT == MVT::f32) {
922	HWTrue = DAG.getConstantFP(Val: `1.0f`, DL, VT: CompareVT);
923	HWFalse = DAG.getConstantFP(Val: `0.0f`, DL, VT: CompareVT);
924	} else if (CompareVT == MVT::i32) {
925	HWTrue = DAG.getConstant(Val: -`1`, DL, VT: CompareVT);
926	HWFalse = DAG.getConstant(Val: `0`, DL, VT: CompareVT);
927	}
928	else {
929	llvm_unreachable("Unhandled value type in LowerSELECT_CC");
930	}
931
932	// Lower this unsupported SELECT_CC into a combination of two supported
933	// SELECT_CC operations.
934	SDValue Cond = DAG.getNode(Opcode: ISD::SELECT_CC, DL, VT: CompareVT, N1: LHS, N2: RHS, N3: HWTrue, N4: HWFalse, N5: CC);
935
936	return DAG.getNode(Opcode: ISD::SELECT_CC, DL, VT,
937	N1: Cond, N2: HWFalse,
938	N3: True, N4: False,
939	N5: DAG.getCondCode(Cond: ISD::SETNE));
940	}
941
942	SDValue R600TargetLowering::lowerADDRSPACECAST(SDValue Op,
943	SelectionDAG &DAG) const {
944	SDLoc SL(Op);
945	EVT VT = Op.getValueType();
946
947	const R600TargetMachine &TM =
948	static_cast<const R600TargetMachine &>(getTargetMachine());
949
950	const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Val&: Op);
951	unsigned SrcAS = ASC->getSrcAddressSpace();
952	unsigned DestAS = ASC->getDestAddressSpace();
953
954	if (isNullConstant(V: Op.getOperand(i: `0`)) && SrcAS == AMDGPUAS::FLAT_ADDRESS)
955	return DAG.getConstant(Val: TM.getNullPointerValue(AddrSpace: DestAS), DL: SL, VT);
956
957	return Op;
958	}
959
960	/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
961	/// convert these pointers to a register index. Each register holds
962	/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
963	/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
964	/// for indirect addressing.
965	SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
966	unsigned StackWidth,
967	SelectionDAG &DAG) const {
968	unsigned SRLPad;
969	switch(StackWidth) {
970	case `1`:
971	SRLPad = `2`;
972	break;
973	case `2`:
974	SRLPad = `3`;
975	break;
976	case `4`:
977	SRLPad = `4`;
978	break;
979	default: llvm_unreachable("Invalid stack width");
980	}
981
982	SDLoc DL(Ptr);
983	return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
984	DAG.getConstant(SRLPad, DL, MVT::i32));
985	}
986
987	void R600TargetLowering::getStackAddress(unsigned StackWidth,
988	unsigned ElemIdx,
989	unsigned &Channel,
990	unsigned &PtrIncr) const {
991	switch (StackWidth) {
992	default:
993	case `1`:
994	Channel = `0`;
995	if (ElemIdx > `0`) {
996	PtrIncr = `1`;
997	} else {
998	PtrIncr = `0`;
999	}
1000	break;
1001	case `2`:
1002	Channel = ElemIdx % `2`;
1003	if (ElemIdx == `2`) {
1004	PtrIncr = `1`;
1005	} else {
1006	PtrIncr = `0`;
1007	}
1008	break;
1009	case `4`:
1010	Channel = ElemIdx;
1011	PtrIncr = `0`;
1012	break;
1013	}
1014	}
1015
1016	SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1017	SelectionDAG &DAG) const {
1018	SDLoc DL(Store);
1019	//TODO: Who creates the i8 stores?
1020	assert(Store->isTruncatingStore()
1021	\|\| Store->getValue().getValueType() == MVT::i8);
1022	assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
1023
1024	SDValue Mask;
1025	if (Store->getMemoryVT() == MVT::i8) {
1026	assert(Store->getAlign() >= `1`);
1027	Mask = DAG.getConstant(`0xff`, DL, MVT::i32);
1028	} else if (Store->getMemoryVT() == MVT::i16) {
1029	assert(Store->getAlign() >= `2`);
1030	Mask = DAG.getConstant(`0xffff`, DL, MVT::i32);
1031	} else {
1032	llvm_unreachable("Unsupported private trunc store");
1033	}
1034
1035	SDValue OldChain = Store->getChain();
1036	bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
1037	// Skip dummy
1038	SDValue Chain = VectorTrunc ? OldChain ->getOperand(Num: `0`) : OldChain;
1039	SDValue BasePtr = Store->getBasePtr();
1040	SDValue Offset = Store->getOffset();
1041	EVT MemVT = Store->getMemoryVT();
1042
1043	SDValue LoadPtr = BasePtr;
1044	if (!Offset.isUndef()) {
1045	LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
1046	}
1047
1048	// Get dword location
1049	// TODO: this should be eliminated by the future SHR ptr, 2
1050	SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
1051	DAG.getConstant(`0xfffffffc`, DL, MVT::i32));
1052
1053	// Load dword
1054	// TODO: can we be smarter about machine pointer info?
1055	MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS);
1056	SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
1057
1058	Chain = Dst.getValue(R: `1`);
1059
1060	// Get offset in dword
1061	SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
1062	DAG.getConstant(`0x3`, DL, MVT::i32));
1063
1064	// Convert byte offset to bit shift
1065	SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1066	DAG.getConstant(`3`, DL, MVT::i32));
1067
1068	// TODO: Contrary to the name of the function,
1069	// it also handles sub i32 non-truncating stores (like i1)
1070	SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1071	Store->getValue());
1072
1073	// Mask the value to the right type
1074	SDValue MaskedValue = DAG.getZeroExtendInReg(Op: SExtValue, DL, VT: MemVT);
1075
1076	// Shift the value in place
1077	SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1078	MaskedValue, ShiftAmt);
1079
1080	// Shift the mask in place
1081	SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
1082
1083	// Invert the mask. NOTE: if we had native ROL instructions we could
1084	// use inverted mask
1085	DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
1086
1087	// Cleanup the target bits
1088	Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1089
1090	// Add the new bits
1091	SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1092
1093	// Store dword
1094	// TODO: Can we be smarter about MachinePointerInfo?
1095	SDValue NewStore = DAG.getStore(Chain, dl: DL, Val: Value, Ptr, PtrInfo);
1096
1097	// If we are part of expanded vector, make our neighbors depend on this store
1098	if (VectorTrunc) {
1099	// Make all other vector elements depend on this store
1100	Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
1101	DAG.ReplaceAllUsesOfValueWith(From: OldChain, To: Chain);
1102	}
1103	return NewStore;
1104	}
1105
1106	SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1107	StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op);
1108	unsigned AS = StoreNode->getAddressSpace();
1109
1110	SDValue Chain = StoreNode->getChain();
1111	SDValue Ptr = StoreNode->getBasePtr();
1112	SDValue Value = StoreNode->getValue();
1113
1114	EVT VT = Value.getValueType();
1115	EVT MemVT = StoreNode->getMemoryVT();
1116	EVT PtrVT = Ptr.getValueType();
1117
1118	SDLoc DL(Op);
1119
1120	const bool TruncatingStore = StoreNode->isTruncatingStore();
1121
1122	// Neither LOCAL nor PRIVATE can do vectors at the moment
1123	if ((AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::PRIVATE_ADDRESS \|\|
1124	TruncatingStore) &&
1125	VT.isVector()) {
1126	if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
1127	// Add an extra level of chain to isolate this vector
1128	SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
1129	// TODO: can the chain be replaced without creating a new store?
1130	SDValue NewStore = DAG.getTruncStore(
1131	Chain: NewChain, dl: DL, Val: Value, Ptr, PtrInfo: StoreNode->getPointerInfo(), SVT: MemVT,
1132	Alignment: StoreNode->getAlign(), MMOFlags: StoreNode->getMemOperand()->getFlags(),
1133	AAInfo: StoreNode->getAAInfo());
1134	StoreNode = cast<StoreSDNode>(Val&: NewStore);
1135	}
1136
1137	return scalarizeVectorStore(ST: StoreNode, DAG);
1138	}
1139
1140	Align Alignment = StoreNode->getAlign();
1141	if (Alignment < MemVT.getStoreSize() &&
1142	!allowsMisalignedMemoryAccesses(VT: MemVT, AS, Alignment,
1143	Flags: StoreNode->getMemOperand()->getFlags(),
1144	IsFast: nullptr)) {
1145	return expandUnalignedStore(ST: StoreNode, DAG);
1146	}
1147
1148	SDValue DWordAddr = DAG.getNode(Opcode: ISD::SRL, DL, VT: PtrVT, N1: Ptr,
1149	N2: DAG.getConstant(Val: `2`, DL, VT: PtrVT));
1150
1151	if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1152	// It is beneficial to create MSKOR here instead of combiner to avoid
1153	// artificial dependencies introduced by RMW
1154	if (TruncatingStore) {
1155	assert(VT.bitsLE(MVT::i32));
1156	SDValue MaskConstant;
1157	if (MemVT == MVT::i8) {
1158	MaskConstant = DAG.getConstant(`0xFF`, DL, MVT::i32);
1159	} else {
1160	assert(MemVT == MVT::i16);
1161	assert(StoreNode->getAlign() >= `2`);
1162	MaskConstant = DAG.getConstant(`0xFFFF`, DL, MVT::i32);
1163	}
1164
1165	SDValue ByteIndex = DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: Ptr,
1166	N2: DAG.getConstant(Val: `0x00000003`, DL, VT: PtrVT));
1167	SDValue BitShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: ByteIndex,
1168	N2: DAG.getConstant(Val: `3`, DL, VT));
1169
1170	// Put the mask in correct place
1171	SDValue Mask = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: MaskConstant, N2: BitShift);
1172
1173	// Put the value bits in correct place
1174	SDValue TruncValue = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Value, N2: MaskConstant);
1175	SDValue ShiftedValue = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: TruncValue, N2: BitShift);
1176
1177	// XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1178	// vector instead.
1179	SDValue Src[`4`] = {
1180	ShiftedValue,
1181	DAG.getConstant(`0`, DL, MVT::i32),
1182	DAG.getConstant(`0`, DL, MVT::i32),
1183	Mask
1184	};
1185	SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1186	SDValue Args[`3`] = { Chain, Input, DWordAddr };
1187	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::STORE_MSKOR, dl: DL,
1188	VTList: Op ->getVTList(), Ops: Args, MemVT,
1189	MMO: StoreNode->getMemOperand());
1190	} else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
1191	// Convert pointer from byte address to dword address.
1192	Ptr = DAG.getNode(Opcode: AMDGPUISD::DWORDADDR, DL, VT: PtrVT, Operand: DWordAddr);
1193
1194	if (StoreNode->isIndexed()) {
1195	llvm_unreachable("Indexed stores not supported yet");
1196	} else {
1197	Chain = DAG.getStore(Chain, dl: DL, Val: Value, Ptr, MMO: StoreNode->getMemOperand());
1198	}
1199	return Chain;
1200	}
1201	}
1202
1203	// GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
1204	if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1205	return SDValue ();
1206
1207	if (MemVT.bitsLT(MVT::i32))
1208	return lowerPrivateTruncStore(Store: StoreNode, DAG);
1209
1210	// Standard i32+ store, tag it with DWORDADDR to note that the address
1211	// has been shifted
1212	if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
1213	Ptr = DAG.getNode(Opcode: AMDGPUISD::DWORDADDR, DL, VT: PtrVT, Operand: DWordAddr);
1214	return DAG.getStore(Chain, dl: DL, Val: Value, Ptr, MMO: StoreNode->getMemOperand());
1215	}
1216
1217	// Tagged i32+ stores will be matched by patterns
1218	return SDValue ();
1219	}
1220
1221	// return (512 + (kc_bank << 12)
1222	static int
1223	ConstantAddressBlock(unsigned AddressSpace) {
1224	switch (AddressSpace) {
1225	case AMDGPUAS::CONSTANT_BUFFER_0:
1226	return `512`;
1227	case AMDGPUAS::CONSTANT_BUFFER_1:
1228	return `512` + `4096`;
1229	case AMDGPUAS::CONSTANT_BUFFER_2:
1230	return `512` + `4096` * `2`;
1231	case AMDGPUAS::CONSTANT_BUFFER_3:
1232	return `512` + `4096` * `3`;
1233	case AMDGPUAS::CONSTANT_BUFFER_4:
1234	return `512` + `4096` * `4`;
1235	case AMDGPUAS::CONSTANT_BUFFER_5:
1236	return `512` + `4096` * `5`;
1237	case AMDGPUAS::CONSTANT_BUFFER_6:
1238	return `512` + `4096` * `6`;
1239	case AMDGPUAS::CONSTANT_BUFFER_7:
1240	return `512` + `4096` * `7`;
1241	case AMDGPUAS::CONSTANT_BUFFER_8:
1242	return `512` + `4096` * `8`;
1243	case AMDGPUAS::CONSTANT_BUFFER_9:
1244	return `512` + `4096` * `9`;
1245	case AMDGPUAS::CONSTANT_BUFFER_10:
1246	return `512` + `4096` * `10`;
1247	case AMDGPUAS::CONSTANT_BUFFER_11:
1248	return `512` + `4096` * `11`;
1249	case AMDGPUAS::CONSTANT_BUFFER_12:
1250	return `512` + `4096` * `12`;
1251	case AMDGPUAS::CONSTANT_BUFFER_13:
1252	return `512` + `4096` * `13`;
1253	case AMDGPUAS::CONSTANT_BUFFER_14:
1254	return `512` + `4096` * `14`;
1255	case AMDGPUAS::CONSTANT_BUFFER_15:
1256	return `512` + `4096` * `15`;
1257	default:
1258	return -`1`;
1259	}
1260	}
1261
1262	SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1263	SelectionDAG &DAG) const {
1264	SDLoc DL(Op);
1265	LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1266	ISD::LoadExtType ExtType = Load->getExtensionType();
1267	EVT MemVT = Load->getMemoryVT();
1268	assert(Load->getAlign() >= MemVT.getStoreSize());
1269
1270	SDValue BasePtr = Load->getBasePtr();
1271	SDValue Chain = Load->getChain();
1272	SDValue Offset = Load->getOffset();
1273
1274	SDValue LoadPtr = BasePtr;
1275	if (!Offset.isUndef()) {
1276	LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
1277	}
1278
1279	// Get dword location
1280	// NOTE: this should be eliminated by the future SHR ptr, 2
1281	SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
1282	DAG.getConstant(`0xfffffffc`, DL, MVT::i32));
1283
1284	// Load dword
1285	// TODO: can we be smarter about machine pointer info?
1286	MachinePointerInfo PtrInfo(AMDGPUAS::PRIVATE_ADDRESS);
1287	SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
1288
1289	// Get offset within the register.
1290	SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1291	LoadPtr, DAG.getConstant(`0x3`, DL, MVT::i32));
1292
1293	// Bit offset of target byte (byteIdx 8).*
1294	SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1295	DAG.getConstant(`3`, DL, MVT::i32));
1296
1297	// Shift to the right.
1298	SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
1299
1300	// Eliminate the upper bits by setting them to ...
1301	EVT MemEltVT = MemVT.getScalarType();
1302
1303	if (ExtType == ISD::SEXTLOAD) { // ... ones.
1304	SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1305	Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
1306	} else { // ... or zeros.
1307	Ret = DAG.getZeroExtendInReg(Op: Ret, DL, VT: MemEltVT);
1308	}
1309
1310	SDValue Ops[] = {
1311	Ret,
1312	Read.getValue(R: `1`) // This should be our output chain
1313	};
1314
1315	return DAG.getMergeValues(Ops, dl: DL);
1316	}
1317
1318	SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1319	LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op);
1320	unsigned AS = LoadNode->getAddressSpace();
1321	EVT MemVT = LoadNode->getMemoryVT();
1322	ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1323
1324	if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1325	ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1326	return lowerPrivateExtLoad(Op, DAG);
1327	}
1328
1329	SDLoc DL(Op);
1330	EVT VT = Op.getValueType();
1331	SDValue Chain = LoadNode->getChain();
1332	SDValue Ptr = LoadNode->getBasePtr();
1333
1334	if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|
1335	LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
1336	VT.isVector()) {
1337	SDValue Ops[`2`];
1338	std::tie(args&: Ops[`0`], args&: Ops[`1`]) = scalarizeVectorLoad(LD: LoadNode, DAG);
1339	return DAG.getMergeValues(Ops, dl: DL);
1340	}
1341
1342	// This is still used for explicit load from addrspace(8)
1343	int ConstantBlock = ConstantAddressBlock(AddressSpace: LoadNode->getAddressSpace());
1344	if (ConstantBlock > -`1` &&
1345	((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) \|\|
1346	(LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1347	SDValue Result;
1348	if (isa<Constant>(Val: LoadNode->getMemOperand()->getValue()) \|\|
1349	isa<ConstantSDNode>(Val: Ptr)) {
1350	return constBufferLoad(LoadNode, Block: LoadNode->getAddressSpace(), DAG);
1351	} else {
1352	//TODO: Does this even work?
1353	// non-constant ptr can't be folded, keeps it as a v4f32 load
1354	Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1355	DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1356	DAG.getConstant(`4`, DL, MVT::i32)),
1357	DAG.getConstant(LoadNode->getAddressSpace() -
1358	AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1359	);
1360	}
1361
1362	if (!VT.isVector()) {
1363	Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1364	DAG.getConstant(`0`, DL, MVT::i32));
1365	}
1366
1367	SDValue MergedValues[`2`] = {
1368	Result,
1369	Chain
1370	};
1371	return DAG.getMergeValues(Ops: MergedValues, dl: DL);
1372	}
1373
1374	// For most operations returning SDValue() will result in the node being
1375	// expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1376	// need to manually expand loads that may be legal in some address spaces and
1377	// illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1378	// compute shaders, since the data is sign extended when it is uploaded to the
1379	// buffer. However SEXT loads from other address spaces are not supported, so
1380	// we need to expand them here.
1381	if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1382	assert(!MemVT.isVector() && (MemVT == MVT::i16 \|\| MemVT == MVT::i8));
1383	SDValue NewLoad = DAG.getExtLoad(
1384	ExtType: ISD::EXTLOAD, dl: DL, VT, Chain, Ptr, PtrInfo: LoadNode->getPointerInfo(), MemVT,
1385	Alignment: LoadNode->getAlign(), MMOFlags: LoadNode->getMemOperand()->getFlags());
1386	SDValue Res = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: NewLoad,
1387	N2: DAG.getValueType(MemVT));
1388
1389	SDValue MergedValues[`2`] = { Res, Chain };
1390	return DAG.getMergeValues(Ops: MergedValues, dl: DL);
1391	}
1392
1393	if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1394	return SDValue ();
1395	}
1396
1397	// DWORDADDR ISD marks already shifted address
1398	if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
1399	assert(VT == MVT::i32);
1400	Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(`2`, DL, MVT::i32));
1401	Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
1402	return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
1403	}
1404	return SDValue ();
1405	}
1406
1407	SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1408	SDValue Chain = Op.getOperand(i: `0`);
1409	SDValue Cond = Op.getOperand(i: `1`);
1410	SDValue Jump = Op.getOperand(i: `2`);
1411
1412	return DAG.getNode(Opcode: AMDGPUISD::BRANCH_COND, DL: SDLoc (Op), VT: Op.getValueType(),
1413	N1: Chain, N2: Jump, N3: Cond);
1414	}
1415
1416	SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1417	SelectionDAG &DAG) const {
1418	MachineFunction &MF = DAG.getMachineFunction();
1419	const R600FrameLowering *TFL = Subtarget->getFrameLowering();
1420
1421	FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Val&: Op);
1422
1423	unsigned FrameIndex = FIN->getIndex();
1424	Register IgnoredFrameReg;
1425	StackOffset Offset =
1426	TFL->getFrameIndexReference(MF, FI: FrameIndex, FrameReg&: IgnoredFrameReg);
1427	return DAG.getConstant(Val: Offset.getFixed() * `4` * TFL->getStackWidth(MF),
1428	DL: SDLoc (Op), VT: Op.getValueType());
1429	}
1430
1431	CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1432	bool IsVarArg) const {
1433	switch (CC) {
1434	case CallingConv::AMDGPU_KERNEL:
1435	case CallingConv::SPIR_KERNEL:
1436	case CallingConv::C:
1437	case CallingConv::Fast:
1438	case CallingConv::Cold:
1439	llvm_unreachable("kernels should not be handled here");
1440	case CallingConv::AMDGPU_VS:
1441	case CallingConv::AMDGPU_GS:
1442	case CallingConv::AMDGPU_PS:
1443	case CallingConv::AMDGPU_CS:
1444	case CallingConv::AMDGPU_HS:
1445	case CallingConv::AMDGPU_ES:
1446	case CallingConv::AMDGPU_LS:
1447	return CC_R600;
1448	default:
1449	report_fatal_error(reason: "Unsupported calling convention.");
1450	}
1451	}
1452
1453	/// XXX Only kernel functions are supported, so we can assume for now that
1454	/// every function is a kernel function, but in the future we should use
1455	/// separate calling conventions for kernel and non-kernel functions.
1456	SDValue R600TargetLowering::LowerFormalArguments(
1457	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1458	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1459	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1460	SmallVector<CCValAssign, `16`> ArgLocs;
1461	CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1462	*DAG.getContext());
1463	MachineFunction &MF = DAG.getMachineFunction();
1464	SmallVector<ISD::InputArg, `8`> LocalIns;
1465
1466	if (AMDGPU::isShader(CC: CallConv)) {
1467	CCInfo.AnalyzeFormalArguments(Ins, Fn: CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg));
1468	} else {
1469	analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
1470	}
1471
1472	for (unsigned i = `0`, e = Ins.size(); i < e; ++i) {
1473	CCValAssign &VA = ArgLocs [i];
1474	const ISD::InputArg &In = Ins [i];
1475	EVT VT = In.VT;
1476	EVT MemVT = VA.getLocVT();
1477	if (!VT.isVector() && MemVT.isVector()) {
1478	// Get load source type if scalarized.
1479	MemVT = MemVT.getVectorElementType();
1480	}
1481
1482	if (AMDGPU::isShader(CC: CallConv)) {
1483	Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
1484	SDValue Register = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
1485	InVals.push_back(Elt: Register);
1486	continue;
1487	}
1488
1489	// i64 isn't a legal type, so the register type used ends up as i32, which
1490	// isn't expected here. It attempts to create this sextload, but it ends up
1491	// being invalid. Somehow this seems to work with i64 arguments, but breaks
1492	// for <1 x i64>.
1493
1494	// The first 36 bytes of the input buffer contains information about
1495	// thread group and global sizes.
1496	ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1497	if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1498	// FIXME: This should really check the extload type, but the handling of
1499	// extload vector parameters seems to be broken.
1500
1501	// Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1502	Ext = ISD::SEXTLOAD;
1503	}
1504
1505	// Compute the offset from the value.
1506	// XXX - I think PartOffset should give you this, but it seems to give the
1507	// size of the register which isn't useful.
1508
1509	unsigned PartOffset = VA.getLocMemOffset();
1510	Align Alignment = commonAlignment(A: Align (VT.getStoreSize()), Offset: PartOffset);
1511
1512	MachinePointerInfo PtrInfo(AMDGPUAS::PARAM_I_ADDRESS);
1513	SDValue Arg = DAG.getLoad(
1514	ISD::UNINDEXED, Ext, VT, DL, Chain,
1515	DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
1516	PtrInfo,
1517	MemVT, Alignment, MachineMemOperand::MONonTemporal \|
1518	MachineMemOperand::MODereferenceable \|
1519	MachineMemOperand::MOInvariant);
1520
1521	InVals.push_back(Elt: Arg);
1522	}
1523	return Chain;
1524	}
1525
1526	EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1527	EVT VT) const {
1528	if (!VT.isVector())
1529	return MVT::i32;
1530	return VT.changeVectorElementTypeToInteger();
1531	}
1532
1533	bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1534	const MachineFunction &MF) const {
1535	// Local and Private addresses do not handle vectors. Limit to i32
1536	if ((AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::PRIVATE_ADDRESS)) {
1537	return (MemVT.getSizeInBits() <= `32`);
1538	}
1539	return true;
1540	}
1541
1542	bool R600TargetLowering::allowsMisalignedMemoryAccesses(
1543	EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1544	unsigned IsFast) const* {
1545	if (IsFast)
1546	*IsFast = `0`;
1547
1548	if (!VT.isSimple() \|\| VT == MVT::Other)
1549	return false;
1550
1551	if (VT.bitsLT(MVT::i32))
1552	return false;
1553
1554	// TODO: This is a rough estimate.
1555	if (IsFast)
1556	*IsFast = `1`;
1557
1558	return VT.bitsGT(MVT::i32) && Alignment >= Align(`4`);
1559	}
1560
1561	static SDValue CompactSwizzlableVector(
1562	SelectionDAG &DAG, SDValue VectorEntry,
1563	DenseMap<unsigned, unsigned> &RemapSwizzle) {
1564	assert(RemapSwizzle.empty());
1565
1566	SDLoc DL(VectorEntry);
1567	EVT EltTy = VectorEntry.getValueType().getVectorElementType();
1568
1569	SDValue NewBldVec[`4`];
1570	for (unsigned i = `0`; i < `4`; i++)
1571	NewBldVec[i] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltTy, N1: VectorEntry,
1572	N2: DAG.getIntPtrConstant(Val: i, DL));
1573
1574	for (unsigned i = `0`; i < `4`; i++) {
1575	if (NewBldVec[i].isUndef())
1576	// We mask write here to teach later passes that the ith element of this
1577	// vector is undef. Thus we can use it to reduce 128 bits reg usage,
1578	// break false dependencies and additionally make assembly easier to read.
1579	RemapSwizzle [i] = `7`; // SEL_MASK_WRITE
1580	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: NewBldVec[i])) {
1581	if (C->isZero()) {
1582	RemapSwizzle [i] = `4`; // SEL_0
1583	NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1584	} else if (C->isExactlyValue(V: `1.0`)) {
1585	RemapSwizzle [i] = `5`; // SEL_1
1586	NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1587	}
1588	}
1589
1590	if (NewBldVec[i].isUndef())
1591	continue;
1592
1593	for (unsigned j = `0`; j < i; j++) {
1594	if (NewBldVec[i] == NewBldVec[j]) {
1595	NewBldVec[i] = DAG.getUNDEF(VT: NewBldVec[i].getValueType());
1596	RemapSwizzle [i] = j;
1597	break;
1598	}
1599	}
1600	}
1601
1602	return DAG.getBuildVector(VT: VectorEntry.getValueType(), DL: SDLoc (VectorEntry),
1603	Ops: NewBldVec);
1604	}
1605
1606	static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1607	DenseMap<unsigned, unsigned> &RemapSwizzle) {
1608	assert(RemapSwizzle.empty());
1609
1610	SDLoc DL(VectorEntry);
1611	EVT EltTy = VectorEntry.getValueType().getVectorElementType();
1612
1613	SDValue NewBldVec[`4`];
1614	bool isUnmovable[`4`] = {false, false, false, false};
1615	for (unsigned i = `0`; i < `4`; i++)
1616	NewBldVec[i] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltTy, N1: VectorEntry,
1617	N2: DAG.getIntPtrConstant(Val: i, DL));
1618
1619	for (unsigned i = `0`; i < `4`; i++) {
1620	RemapSwizzle [i] = i;
1621	if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1622	unsigned Idx = NewBldVec[i].getConstantOperandVal(i: `1`);
1623	if (i == Idx)
1624	isUnmovable[Idx] = true;
1625	}
1626	}
1627
1628	for (unsigned i = `0`; i < `4`; i++) {
1629	if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1630	unsigned Idx = NewBldVec[i].getConstantOperandVal(i: `1`);
1631	if (isUnmovable[Idx])
1632	continue;
1633	// Swap i and Idx
1634	std::swap(a&: NewBldVec[Idx], b&: NewBldVec[i]);
1635	std::swap(a&: RemapSwizzle [i], b&: RemapSwizzle [Idx]);
1636	break;
1637	}
1638	}
1639
1640	return DAG.getBuildVector(VT: VectorEntry.getValueType(), DL: SDLoc (VectorEntry),
1641	Ops: NewBldVec);
1642	}
1643
1644	SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
1645	SelectionDAG &DAG,
1646	const SDLoc &DL) const {
1647	// Old -> New swizzle values
1648	DenseMap<unsigned, unsigned> SwizzleRemap;
1649
1650	BuildVector = CompactSwizzlableVector(DAG, VectorEntry: BuildVector, RemapSwizzle&: SwizzleRemap);
1651	for (unsigned i = `0`; i < `4`; i++) {
1652	unsigned Idx = Swz[i]->getAsZExtVal();
1653	if (SwizzleRemap.contains(Idx))
1654	Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1655	}
1656
1657	SwizzleRemap.clear();
1658	BuildVector = ReorganizeVector(DAG, VectorEntry: BuildVector, RemapSwizzle&: SwizzleRemap);
1659	for (unsigned i = `0`; i < `4`; i++) {
1660	unsigned Idx = Swz[i]->getAsZExtVal();
1661	if (SwizzleRemap.contains(Idx))
1662	Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1663	}
1664
1665	return BuildVector;
1666	}
1667
1668	SDValue R600TargetLowering::constBufferLoad(LoadSDNode LoadNode, int* Block,
1669	SelectionDAG &DAG) const {
1670	SDLoc DL(LoadNode);
1671	EVT VT = LoadNode->getValueType(ResNo: `0`);
1672	SDValue Chain = LoadNode->getChain();
1673	SDValue Ptr = LoadNode->getBasePtr();
1674	assert (isa<ConstantSDNode>(Ptr));
1675
1676	//TODO: Support smaller loads
1677	if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 \|\| !ISD::isNON_EXTLoad(LoadNode))
1678	return SDValue ();
1679
1680	if (LoadNode->getAlign() < Align (`4`))
1681	return SDValue ();
1682
1683	int ConstantBlock = ConstantAddressBlock(AddressSpace: Block);
1684
1685	SDValue Slots[`4`];
1686	for (unsigned i = `0`; i < `4`; i++) {
1687	// We want Const position encoded with the following formula :
1688	// (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1689	// const_index is Ptr computed by llvm using an alignment of 16.
1690	// Thus we add (((512 + (kc_bank << 12)) + chan ) 4 here and*
1691	// then div by 4 at the ISel step
1692	SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1693	DAG.getConstant(`4` * i + ConstantBlock * `16`, DL, MVT::i32));
1694	Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1695	}
1696	EVT NewVT = MVT::v4i32;
1697	unsigned NumElements = `4`;
1698	if (VT.isVector()) {
1699	NewVT = VT;
1700	NumElements = VT.getVectorNumElements();
1701	}
1702	SDValue Result = DAG.getBuildVector(VT: NewVT, DL, Ops: ArrayRef(Slots, NumElements));
1703	if (!VT.isVector()) {
1704	Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1705	DAG.getConstant(`0`, DL, MVT::i32));
1706	}
1707	SDValue MergedValues[`2`] = {
1708	Result,
1709	Chain
1710	};
1711	return DAG.getMergeValues(Ops: MergedValues, dl: DL);
1712	}
1713
1714	//===----------------------------------------------------------------------===//
1715	// Custom DAG Optimizations
1716	//===----------------------------------------------------------------------===//
1717
1718	SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1719	DAGCombinerInfo &DCI) const {
1720	SelectionDAG &DAG = DCI.DAG;
1721	SDLoc DL(N);
1722
1723	switch (N->getOpcode()) {
1724	// (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1725	case ISD::FP_ROUND: {
1726	SDValue Arg = N->getOperand(Num: `0`);
1727	if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1728	return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: N->getValueType(ResNo: `0`),
1729	Operand: Arg.getOperand(i: `0`));
1730	}
1731	break;
1732	}
1733
1734	// (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1735	// (i32 select_cc f32, f32, -1, 0 cc)
1736	//
1737	// Mesa's GLSL frontend generates the above pattern a lot and we can lower
1738	// this to one of the SET_DX10 instructions.*
1739	case ISD::FP_TO_SINT: {
1740	SDValue FNeg = N->getOperand(Num: `0`);
1741	if (FNeg.getOpcode() != ISD::FNEG) {
1742	return SDValue ();
1743	}
1744	SDValue SelectCC = FNeg.getOperand(i: `0`);
1745	if (SelectCC.getOpcode() != ISD::SELECT_CC \|\|
1746	SelectCC.getOperand(`0`).getValueType() != MVT::f32 \|\| // LHS
1747	SelectCC.getOperand(`2`).getValueType() != MVT::f32 \|\| // True
1748	!isHWTrueValue(SelectCC.getOperand(`2`)) \|\|
1749	!isHWFalseValue(SelectCC.getOperand(`3`))) {
1750	return SDValue ();
1751	}
1752
1753	return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(`0`),
1754	SelectCC.getOperand(`0`), // LHS
1755	SelectCC.getOperand(`1`), // RHS
1756	DAG.getConstant(-`1`, DL, MVT::i32), // True
1757	DAG.getConstant(`0`, DL, MVT::i32), // False
1758	SelectCC.getOperand(`4`)); // CC
1759	}
1760
1761	// insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1762	// => build_vector elt0, ... , NewEltIdx, ... , eltN
1763	case ISD::INSERT_VECTOR_ELT: {
1764	SDValue InVec = N->getOperand(Num: `0`);
1765	SDValue InVal = N->getOperand(Num: `1`);
1766	SDValue EltNo = N->getOperand(Num: `2`);
1767
1768	// If the inserted element is an UNDEF, just use the input vector.
1769	if (InVal.isUndef())
1770	return InVec;
1771
1772	EVT VT = InVec.getValueType();
1773
1774	// If we can't generate a legal BUILD_VECTOR, exit
1775	if (!isOperationLegal(Op: ISD::BUILD_VECTOR, VT))
1776	return SDValue ();
1777
1778	// Check that we know which element is being inserted
1779	if (!isa<ConstantSDNode>(Val: EltNo))
1780	return SDValue ();
1781	unsigned Elt = EltNo ->getAsZExtVal();
1782
1783	// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1784	// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
1785	// vector elements.
1786	SmallVector<SDValue, `8`> Ops;
1787	if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1788	Ops.append(in_start: InVec.getNode()->op_begin(),
1789	in_end: InVec.getNode()->op_end());
1790	} else if (InVec.isUndef()) {
1791	unsigned NElts = VT.getVectorNumElements();
1792	Ops.append(NumInputs: NElts, Elt: DAG.getUNDEF(VT: InVal.getValueType()));
1793	} else {
1794	return SDValue ();
1795	}
1796
1797	// Insert the element
1798	if (Elt < Ops.size()) {
1799	// All the operands of BUILD_VECTOR must have the same type;
1800	// we enforce that here.
1801	EVT OpVT = Ops [`0`].getValueType();
1802	if (InVal.getValueType() != OpVT)
1803	InVal = OpVT.bitsGT(VT: InVal.getValueType()) ?
1804	DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: OpVT, Operand: InVal) :
1805	DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpVT, Operand: InVal);
1806	Ops [Elt] = InVal;
1807	}
1808
1809	// Return the new vector
1810	return DAG.getBuildVector(VT, DL, Ops);
1811	}
1812
1813	// Extract_vec (Build_vector) generated by custom lowering
1814	// also needs to be customly combined
1815	case ISD::EXTRACT_VECTOR_ELT: {
1816	SDValue Arg = N->getOperand(Num: `0`);
1817	if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1818	if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`))) {
1819	unsigned Element = Const->getZExtValue();
1820	return Arg ->getOperand(Num: Element);
1821	}
1822	}
1823	if (Arg.getOpcode() == ISD::BITCAST &&
1824	Arg.getOperand(i: `0`).getOpcode() == ISD::BUILD_VECTOR &&
1825	(Arg.getOperand(i: `0`).getValueType().getVectorNumElements() ==
1826	Arg.getValueType().getVectorNumElements())) {
1827	if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`))) {
1828	unsigned Element = Const->getZExtValue();
1829	return DAG.getNode(Opcode: ISD::BITCAST, DL, VTList: N->getVTList(),
1830	N: Arg ->getOperand(Num: `0`).getOperand(i: Element));
1831	}
1832	}
1833	break;
1834	}
1835
1836	case ISD::SELECT_CC: {
1837	// Try common optimizations
1838	if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
1839	return Ret;
1840
1841	// fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1842	// selectcc x, y, a, b, inv(cc)
1843	//
1844	// fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1845	// selectcc x, y, a, b, cc
1846	SDValue LHS = N->getOperand(Num: `0`);
1847	if (LHS.getOpcode() != ISD::SELECT_CC) {
1848	return SDValue ();
1849	}
1850
1851	SDValue RHS = N->getOperand(Num: `1`);
1852	SDValue True = N->getOperand(Num: `2`);
1853	SDValue False = N->getOperand(Num: `3`);
1854	ISD::CondCode NCC = cast<CondCodeSDNode>(Val: N->getOperand(Num: `4`))->get();
1855
1856	if (LHS.getOperand(i: `2`).getNode() != True.getNode() \|\|
1857	LHS.getOperand(i: `3`).getNode() != False.getNode() \|\|
1858	RHS.getNode() != False.getNode()) {
1859	return SDValue ();
1860	}
1861
1862	switch (NCC) {
1863	default: return SDValue ();
1864	case ISD::SETNE: return LHS;
1865	case ISD::SETEQ: {
1866	ISD::CondCode LHSCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: `4`))->get();
1867	LHSCC = ISD::getSetCCInverse(Operation: LHSCC, Type: LHS.getOperand(i: `0`).getValueType());
1868	if (DCI.isBeforeLegalizeOps() \|\|
1869	isCondCodeLegal(CC: LHSCC, VT: LHS.getOperand(i: `0`).getSimpleValueType()))
1870	return DAG.getSelectCC(DL,
1871	LHS: LHS.getOperand(i: `0`),
1872	RHS: LHS.getOperand(i: `1`),
1873	True: LHS.getOperand(i: `2`),
1874	False: LHS.getOperand(i: `3`),
1875	Cond: LHSCC);
1876	break;
1877	}
1878	}
1879	return SDValue ();
1880	}
1881
1882	case AMDGPUISD::R600_EXPORT: {
1883	SDValue Arg = N->getOperand(Num: `1`);
1884	if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1885	break;
1886
1887	SDValue NewArgs[`8`] = {
1888	N->getOperand(Num: `0`), // Chain
1889	SDValue (),
1890	N->getOperand(Num: `2`), // ArrayBase
1891	N->getOperand(Num: `3`), // Type
1892	N->getOperand(Num: `4`), // SWZ_X
1893	N->getOperand(Num: `5`), // SWZ_Y
1894	N->getOperand(Num: `6`), // SWZ_Z
1895	N->getOperand(Num: `7`) // SWZ_W
1896	};
1897	NewArgs[`1`] = OptimizeSwizzle(BuildVector: N->getOperand(Num: `1`), Swz: &NewArgs[`4`], DAG, DL);
1898	return DAG.getNode(Opcode: AMDGPUISD::R600_EXPORT, DL, VTList: N->getVTList(), Ops: NewArgs);
1899	}
1900	case AMDGPUISD::TEXTURE_FETCH: {
1901	SDValue Arg = N->getOperand(Num: `1`);
1902	if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1903	break;
1904
1905	SDValue NewArgs[`19`] = {
1906	N->getOperand(Num: `0`),
1907	N->getOperand(Num: `1`),
1908	N->getOperand(Num: `2`),
1909	N->getOperand(Num: `3`),
1910	N->getOperand(Num: `4`),
1911	N->getOperand(Num: `5`),
1912	N->getOperand(Num: `6`),
1913	N->getOperand(Num: `7`),
1914	N->getOperand(Num: `8`),
1915	N->getOperand(Num: `9`),
1916	N->getOperand(Num: `10`),
1917	N->getOperand(Num: `11`),
1918	N->getOperand(Num: `12`),
1919	N->getOperand(Num: `13`),
1920	N->getOperand(Num: `14`),
1921	N->getOperand(Num: `15`),
1922	N->getOperand(Num: `16`),
1923	N->getOperand(Num: `17`),
1924	N->getOperand(Num: `18`),
1925	};
1926	NewArgs[`1`] = OptimizeSwizzle(BuildVector: N->getOperand(Num: `1`), Swz: &NewArgs[`2`], DAG, DL);
1927	return DAG.getNode(Opcode: AMDGPUISD::TEXTURE_FETCH, DL, VTList: N->getVTList(), Ops: NewArgs);
1928	}
1929
1930	case ISD::LOAD: {
1931	LoadSDNode *LoadNode = cast<LoadSDNode>(Val: N);
1932	SDValue Ptr = LoadNode->getBasePtr();
1933	if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
1934	isa<ConstantSDNode>(Val: Ptr))
1935	return constBufferLoad(LoadNode, Block: AMDGPUAS::CONSTANT_BUFFER_0, DAG);
1936	break;
1937	}
1938
1939	default: break;
1940	}
1941
1942	return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1943	}
1944
1945	bool R600TargetLowering::FoldOperand(SDNode ParentNode, unsigned* SrcIdx,
1946	SDValue &Src, SDValue &Neg, SDValue &Abs,
1947	SDValue &Sel, SDValue &Imm,
1948	SelectionDAG &DAG) const {
1949	const R600InstrInfo *TII = Subtarget->getInstrInfo();
1950	if (!Src.isMachineOpcode())
1951	return false;
1952
1953	switch (Src.getMachineOpcode()) {
1954	case R600::FNEG_R600:
1955	if (!Neg.getNode())
1956	return false;
1957	Src = Src.getOperand(i: `0`);
1958	Neg = DAG.getTargetConstant(`1`, SDLoc(ParentNode), MVT::i32);
1959	return true;
1960	case R600::FABS_R600:
1961	if (!Abs.getNode())
1962	return false;
1963	Src = Src.getOperand(i: `0`);
1964	Abs = DAG.getTargetConstant(`1`, SDLoc(ParentNode), MVT::i32);
1965	return true;
1966	case R600::CONST_COPY: {
1967	unsigned Opcode = ParentNode->getMachineOpcode();
1968	bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -`1`;
1969
1970	if (!Sel.getNode())
1971	return false;
1972
1973	SDValue CstOffset = Src.getOperand(i: `0`);
1974	if (ParentNode->getValueType(ResNo: `0`).isVector())
1975	return false;
1976
1977	// Gather constants values
1978	int SrcIndices[] = {
1979	TII->getOperandIdx(Opcode, R600::OpName::src0),
1980	TII->getOperandIdx(Opcode, R600::OpName::src1),
1981	TII->getOperandIdx(Opcode, R600::OpName::src2),
1982	TII->getOperandIdx(Opcode, R600::OpName::src0_X),
1983	TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
1984	TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
1985	TII->getOperandIdx(Opcode, R600::OpName::src0_W),
1986	TII->getOperandIdx(Opcode, R600::OpName::src1_X),
1987	TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
1988	TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
1989	TII->getOperandIdx(Opcode, R600::OpName::src1_W)
1990	};
1991	std::vector<unsigned> Consts;
1992	for (int OtherSrcIdx : SrcIndices) {
1993	int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1994	if (OtherSrcIdx < `0` \|\| OtherSelIdx < `0`)
1995	continue;
1996	if (HasDst) {
1997	OtherSrcIdx--;
1998	OtherSelIdx--;
1999	}
2000	if (RegisterSDNode *Reg =
2001	dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2002	if (Reg->getReg() == R600::ALU_CONST) {
2003	Consts.push_back(ParentNode->getConstantOperandVal(OtherSelIdx));
2004	}
2005	}
2006	}
2007
2008	ConstantSDNode *Cst = cast<ConstantSDNode>(Val&: CstOffset);
2009	Consts.push_back(x: Cst->getZExtValue());
2010	if (!TII->fitsConstReadLimitations(Consts)) {
2011	return false;
2012	}
2013
2014	Sel = CstOffset;
2015	Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
2016	return true;
2017	}
2018	case R600::MOV_IMM_GLOBAL_ADDR:
2019	// Check if the Imm slot is used. Taken from below.
2020	if (Imm ->getAsZExtVal())
2021	return false;
2022	Imm = Src.getOperand(i: `0`);
2023	Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
2024	return true;
2025	case R600::MOV_IMM_I32:
2026	case R600::MOV_IMM_F32: {
2027	unsigned ImmReg = R600::ALU_LITERAL_X;
2028	uint64_t ImmValue = `0`;
2029
2030	if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
2031	ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Val: Src.getOperand(i: `0`));
2032	float FloatValue = FPC->getValueAPF().convertToFloat();
2033	if (FloatValue == `0.0`) {
2034	ImmReg = R600::ZERO;
2035	} else if (FloatValue == `0.5`) {
2036	ImmReg = R600::HALF;
2037	} else if (FloatValue == `1.0`) {
2038	ImmReg = R600::ONE;
2039	} else {
2040	ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2041	}
2042	} else {
2043	uint64_t Value = Src.getConstantOperandVal(i: `0`);
2044	if (Value == `0`) {
2045	ImmReg = R600::ZERO;
2046	} else if (Value == `1`) {
2047	ImmReg = R600::ONE_INT;
2048	} else {
2049	ImmValue = Value;
2050	}
2051	}
2052
2053	// Check that we aren't already using an immediate.
2054	// XXX: It's possible for an instruction to have more than one
2055	// immediate operand, but this is not supported yet.
2056	if (ImmReg == R600::ALU_LITERAL_X) {
2057	if (!Imm.getNode())
2058	return false;
2059	ConstantSDNode *C = cast<ConstantSDNode>(Val&: Imm);
2060	if (C->getZExtValue())
2061	return false;
2062	Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2063	}
2064	Src = DAG.getRegister(ImmReg, MVT::i32);
2065	return true;
2066	}
2067	default:
2068	return false;
2069	}
2070	}
2071
2072	/// Fold the instructions after selecting them
2073	SDNode R600TargetLowering::PostISelFolding(MachineSDNode Node,
2074	SelectionDAG &DAG) const {
2075	const R600InstrInfo *TII = Subtarget->getInstrInfo();
2076	if (!Node->isMachineOpcode())
2077	return Node;
2078
2079	unsigned Opcode = Node->getMachineOpcode();
2080	SDValue FakeOp;
2081
2082	std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2083
2084	if (Opcode == R600::DOT_4) {
2085	int OperandIdx[] = {
2086	TII->getOperandIdx(Opcode, R600::OpName::src0_X),
2087	TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
2088	TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
2089	TII->getOperandIdx(Opcode, R600::OpName::src0_W),
2090	TII->getOperandIdx(Opcode, R600::OpName::src1_X),
2091	TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
2092	TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
2093	TII->getOperandIdx(Opcode, R600::OpName::src1_W)
2094	};
2095	int NegIdx[] = {
2096	TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
2097	TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
2098	TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
2099	TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
2100	TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
2101	TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
2102	TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
2103	TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
2104	};
2105	int AbsIdx[] = {
2106	TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
2107	TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
2108	TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
2109	TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
2110	TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
2111	TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
2112	TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
2113	TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
2114	};
2115	for (unsigned i = `0`; i < `8`; i++) {
2116	if (OperandIdx[i] < `0`)
2117	return Node;
2118	SDValue &Src = Ops [OperandIdx[i] - `1`];
2119	SDValue &Neg = Ops [NegIdx[i] - `1`];
2120	SDValue &Abs = Ops [AbsIdx[i] - `1`];
2121	bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -`1`;
2122	int SelIdx = TII->getSelIdx(Opcode, SrcIdx: OperandIdx[i]);
2123	if (HasDst)
2124	SelIdx--;
2125	SDValue &Sel = (SelIdx > -`1`) ? Ops [SelIdx] : FakeOp;
2126	if (FoldOperand(ParentNode: Node, SrcIdx: i, Src, Neg, Abs, Sel, Imm&: FakeOp, DAG))
2127	return DAG.getMachineNode(Opcode, dl: SDLoc (Node), VTs: Node->getVTList(), Ops);
2128	}
2129	} else if (Opcode == R600::REG_SEQUENCE) {
2130	for (unsigned i = `1`, e = Node->getNumOperands(); i < e; i += `2`) {
2131	SDValue &Src = Ops [i];
2132	if (FoldOperand(ParentNode: Node, SrcIdx: i, Src, Neg&: FakeOp, Abs&: FakeOp, Sel&: FakeOp, Imm&: FakeOp, DAG))
2133	return DAG.getMachineNode(Opcode, dl: SDLoc (Node), VTs: Node->getVTList(), Ops);
2134	}
2135	} else {
2136	if (!TII->hasInstrModifiers(Opcode))
2137	return Node;
2138	int OperandIdx[] = {
2139	TII->getOperandIdx(Opcode, R600::OpName::src0),
2140	TII->getOperandIdx(Opcode, R600::OpName::src1),
2141	TII->getOperandIdx(Opcode, R600::OpName::src2)
2142	};
2143	int NegIdx[] = {
2144	TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
2145	TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
2146	TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
2147	};
2148	int AbsIdx[] = {
2149	TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
2150	TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
2151	-`1`
2152	};
2153	for (unsigned i = `0`; i < `3`; i++) {
2154	if (OperandIdx[i] < `0`)
2155	return Node;
2156	SDValue &Src = Ops [OperandIdx[i] - `1`];
2157	SDValue &Neg = Ops [NegIdx[i] - `1`];
2158	SDValue FakeAbs;
2159	SDValue &Abs = (AbsIdx[i] > -`1`) ? Ops [AbsIdx[i] - `1`] : FakeAbs;
2160	bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -`1`;
2161	int SelIdx = TII->getSelIdx(Opcode, SrcIdx: OperandIdx[i]);
2162	int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
2163	if (HasDst) {
2164	SelIdx--;
2165	ImmIdx--;
2166	}
2167	SDValue &Sel = (SelIdx > -`1`) ? Ops [SelIdx] : FakeOp;
2168	SDValue &Imm = Ops [ImmIdx];
2169	if (FoldOperand(ParentNode: Node, SrcIdx: i, Src, Neg, Abs, Sel, Imm, DAG))
2170	return DAG.getMachineNode(Opcode, dl: SDLoc (Node), VTs: Node->getVTList(), Ops);
2171	}
2172	}
2173
2174	return Node;
2175	}
2176
2177	TargetLowering::AtomicExpansionKind
2178	R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst RMW) const* {
2179	switch (RMW->getOperation()) {
2180	case AtomicRMWInst::UIncWrap:
2181	case AtomicRMWInst::UDecWrap:
2182	// FIXME: Cayman at least appears to have instructions for this, but the
2183	// instruction defintions appear to be missing.
2184	return AtomicExpansionKind::CmpXChg;
2185	default:
2186	break;
2187	}
2188
2189	return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
2190	}
2191

source code of llvm/lib/Target/AMDGPU/R600ISelLowering.cpp