ARMSelectionDAGInfo.cpp source code [llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp]

1	//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the ARMSelectionDAGInfo class.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "ARMTargetMachine.h"
14	#include "ARMTargetTransformInfo.h"
15	#include "llvm/CodeGen/SelectionDAG.h"
16	#include "llvm/IR/DerivedTypes.h"
17	#include "llvm/Support/CommandLine.h"
18	using namespace llvm;
19
20	#define DEBUG_TYPE "arm-selectiondag-info"
21
22	cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
23	"arm-memtransfer-tploop", cl::Hidden,
24	cl::desc ("Control conversion of memcpy to "
25	"Tail predicated loops (WLSTP)"),
26	cl::init(Val: TPLoop::ForceDisabled),
27	cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
28	"Don't convert memcpy to TP loop."),
29	clEnumValN(TPLoop::ForceEnabled, "force-enabled",
30	"Always convert memcpy to TP loop."),
31	clEnumValN(TPLoop::Allow, "allow",
32	"Allow (may be subject to certain conditions) "
33	"conversion of memcpy to TP loop.")));
34
35	// Emit, if possible, a specialized version of the given Libcall. Typically this
36	// means selecting the appropriately aligned version, but we also convert memset
37	// of 0 into memclr.
38	SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
39	SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
40	SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
41	const ARMSubtarget &Subtarget =
42	DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
43	const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
44
45	// Only use a specialized AEABI function if the default version of this
46	// Libcall is an AEABI function.
47	if (std::strncmp(s1: TLI->getLibcallName(Call: LC), s2: "__aeabi", n: `7`) != `0`)
48	return SDValue ();
49
50	// Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
51	// able to translate memset to memclr and use the value to index the function
52	// name array.
53	enum {
54	AEABI_MEMCPY = `0`,
55	AEABI_MEMMOVE,
56	AEABI_MEMSET,
57	AEABI_MEMCLR
58	} AEABILibcall;
59	switch (LC) {
60	case RTLIB::MEMCPY:
61	AEABILibcall = AEABI_MEMCPY;
62	break;
63	case RTLIB::MEMMOVE:
64	AEABILibcall = AEABI_MEMMOVE;
65	break;
66	case RTLIB::MEMSET:
67	AEABILibcall = AEABI_MEMSET;
68	if (isNullConstant(V: Src))
69	AEABILibcall = AEABI_MEMCLR;
70	break;
71	default:
72	return SDValue ();
73	}
74
75	// Choose the most-aligned libcall variant that we can
76	enum {
77	ALIGN1 = `0`,
78	ALIGN4,
79	ALIGN8
80	} AlignVariant;
81	if ((Align & `7`) == `0`)
82	AlignVariant = ALIGN8;
83	else if ((Align & `3`) == `0`)
84	AlignVariant = ALIGN4;
85	else
86	AlignVariant = ALIGN1;
87
88	TargetLowering::ArgListTy Args;
89	TargetLowering::ArgListEntry Entry;
90	Entry.Ty = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
91	Entry.Node = Dst;
92	Args.push_back(x: Entry);
93	if (AEABILibcall == AEABI_MEMCLR) {
94	Entry.Node = Size;
95	Args.push_back(x: Entry);
96	} else if (AEABILibcall == AEABI_MEMSET) {
97	// Adjust parameters for memset, EABI uses format (ptr, size, value),
98	// GNU library uses (ptr, value, size)
99	// See RTABI section 4.3.4
100	Entry.Node = Size;
101	Args.push_back(x: Entry);
102
103	// Extend or truncate the argument to be an i32 value for the call.
104	if (Src.getValueType().bitsGT(MVT::i32))
105	Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
106	else if (Src.getValueType().bitsLT(MVT::i32))
107	Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
108
109	Entry.Node = Src;
110	Entry.Ty = Type::getInt32Ty(C&: *DAG.getContext());
111	Entry.IsSExt = false;
112	Args.push_back(x: Entry);
113	} else {
114	Entry.Node = Src;
115	Args.push_back(x: Entry);
116
117	Entry.Node = Size;
118	Args.push_back(x: Entry);
119	}
120
121	char const *FunctionNames[`4`][`3`] = {
122	{ "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
123	{ "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
124	{ "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
125	{ "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
126	};
127	TargetLowering::CallLoweringInfo CLI(DAG);
128	CLI.setDebugLoc(dl)
129	.setChain(Chain)
130	.setLibCallee(
131	CC: TLI->getLibcallCallingConv(Call: LC), ResultType: Type::getVoidTy(C&: *DAG.getContext()),
132	Target: DAG.getExternalSymbol(Sym: FunctionNames[AEABILibcall][AlignVariant],
133	VT: TLI->getPointerTy(DL: DAG.getDataLayout())),
134	ArgsList: std::move(Args))
135	.setDiscardResult();
136	std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
137
138	return CallResult.second;
139	}
140
141	static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
142	const SelectionDAG &DAG,
143	ConstantSDNode *ConstantSize,
144	Align Alignment, bool IsMemcpy) {
145	auto &F = DAG.getMachineFunction().getFunction();
146	if (!EnableMemtransferTPLoop)
147	return false;
148	if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
149	return true;
150	// Do not generate inline TP loop if optimizations is disabled,
151	// or if optimization for size (-Os or -Oz) is on.
152	if (F.hasOptNone() \|\| F.hasOptSize())
153	return false;
154	// If cli option is unset, for memset always generate inline TP.
155	// For memcpy, check some conditions
156	if (!IsMemcpy)
157	return true;
158	if (!ConstantSize && Alignment >= Align (`4`))
159	return true;
160	if (ConstantSize &&
161	ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
162	ConstantSize->getZExtValue() <
163	Subtarget.getMaxMemcpyTPInlineSizeThreshold())
164	return true;
165	return false;
166	}
167
168	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
169	SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
170	SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
171	MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
172	const ARMSubtarget &Subtarget =
173	DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
174	ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
175
176	if (Subtarget.hasMVEIntegerOps() &&
177	shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
178	return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
179	DAG.getZExtOrTrunc(Size, dl, MVT::i32));
180
181	// Do repeated 4-byte loads and stores. To be improved.
182	// This requires 4-byte alignment.
183	if (Alignment < Align (`4`))
184	return SDValue ();
185	// This requires the copy size to be a constant, preferably
186	// within a subtarget-specific limit.
187	if (!ConstantSize)
188	return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
189	Align: Alignment.value(), LC: RTLIB::MEMCPY);
190	uint64_t SizeVal = ConstantSize->getZExtValue();
191	if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
192	return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
193	Align: Alignment.value(), LC: RTLIB::MEMCPY);
194
195	unsigned BytesLeft = SizeVal & `3`;
196	unsigned NumMemOps = SizeVal >> `2`;
197	unsigned EmittedNumMemOps = `0`;
198	EVT VT = MVT::i32;
199	unsigned VTSize = `4`;
200	unsigned i = `0`;
201	// Emit a maximum of 4 loads in Thumb1 since we have fewer registers
202	const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? `4` : `6`;
203	SDValue TFOps[`6`];
204	SDValue Loads[`6`];
205	uint64_t SrcOff = `0`, DstOff = `0`;
206
207	// FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
208	// VLDM/VSTM and make this code emit it when appropriate. This would reduce
209	// pressure on the general purpose registers. However this seems harder to map
210	// onto the register allocator's view of the world.
211
212	// The number of MEMCPY pseudo-instructions to emit. We use up to
213	// MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
214	// later on. This is a lower bound on the number of MEMCPY operations we must
215	// emit.
216	unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - `1`) / MaxLoadsInLDM;
217
218	// Code size optimisation: do not inline memcpy if expansion results in
219	// more instructions than the libary call.
220	if (NumMEMCPYs > `1` && Subtarget.hasMinSize()) {
221	return SDValue ();
222	}
223
224	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
225
226	for (unsigned I = `0`; I != NumMEMCPYs; ++I) {
227	// Evenly distribute registers among MEMCPY operations to reduce register
228	// pressure.
229	unsigned NextEmittedNumMemOps = NumMemOps * (I + `1`) / NumMEMCPYs;
230	unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
231
232	Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
233	DAG.getConstant(NumRegs, dl, MVT::i32));
234	Src = Dst.getValue(R: `1`);
235	Chain = Dst.getValue(R: `2`);
236
237	DstPtrInfo = DstPtrInfo.getWithOffset(O: NumRegs * VTSize);
238	SrcPtrInfo = SrcPtrInfo.getWithOffset(O: NumRegs * VTSize);
239
240	EmittedNumMemOps = NextEmittedNumMemOps;
241	}
242
243	if (BytesLeft == `0`)
244	return Chain;
245
246	// Issue loads / stores for the trailing (1 - 3) bytes.
247	auto getRemainingValueType = [](unsigned BytesLeft) {
248	return (BytesLeft >= `2`) ? MVT::i16 : MVT::i8;
249	};
250	auto getRemainingSize = [](unsigned BytesLeft) {
251	return (BytesLeft >= `2`) ? `2` : `1`;
252	};
253
254	unsigned BytesLeftSave = BytesLeft;
255	i = `0`;
256	while (BytesLeft) {
257	VT = getRemainingValueType (BytesLeft);
258	VTSize = getRemainingSize (BytesLeft);
259	Loads[i] = DAG.getLoad(VT, dl, Chain,
260	DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
261	DAG.getConstant(SrcOff, dl, MVT::i32)),
262	SrcPtrInfo.getWithOffset(SrcOff));
263	TFOps[i] = Loads[i].getValue(R: `1`);
264	++i;
265	SrcOff += VTSize;
266	BytesLeft -= VTSize;
267	}
268	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
269
270	i = `0`;
271	BytesLeft = BytesLeftSave;
272	while (BytesLeft) {
273	VT = getRemainingValueType (BytesLeft);
274	VTSize = getRemainingSize (BytesLeft);
275	TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
276	DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
277	DAG.getConstant(DstOff, dl, MVT::i32)),
278	DstPtrInfo.getWithOffset(DstOff));
279	++i;
280	DstOff += VTSize;
281	BytesLeft -= VTSize;
282	}
283	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
284	}
285
286	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
287	SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
288	SDValue Size, Align Alignment, bool isVolatile,
289	MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
290	return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
291	Align: Alignment.value(), LC: RTLIB::MEMMOVE);
292	}
293
294	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
295	SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
296	SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
297	MachinePointerInfo DstPtrInfo) const {
298
299	const ARMSubtarget &Subtarget =
300	DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
301
302	ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
303
304	// Generate TP loop for llvm.memset
305	if (Subtarget.hasMVEIntegerOps() &&
306	shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
307	IsMemcpy: false)) {
308	Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
309	DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
310	return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
311	DAG.getZExtOrTrunc(Size, dl, MVT::i32));
312	}
313
314	if (!AlwaysInline)
315	return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
316	Align: Alignment.value(), LC: RTLIB::MEMSET);
317
318	return SDValue ();
319	}
320

source code of llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp