1 | //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines an instruction selector for the NVPTX target. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "NVPTXISelDAGToDAG.h" |
14 | #include "MCTargetDesc/NVPTXBaseInfo.h" |
15 | #include "NVPTXUtilities.h" |
16 | #include "llvm/Analysis/ValueTracking.h" |
17 | #include "llvm/CodeGen/ISDOpcodes.h" |
18 | #include "llvm/IR/GlobalValue.h" |
19 | #include "llvm/IR/Instructions.h" |
20 | #include "llvm/IR/IntrinsicsNVPTX.h" |
21 | #include "llvm/Support/AtomicOrdering.h" |
22 | #include "llvm/Support/CommandLine.h" |
23 | #include "llvm/Support/Debug.h" |
24 | #include "llvm/Support/ErrorHandling.h" |
25 | #include "llvm/Support/raw_ostream.h" |
26 | #include "llvm/Target/TargetIntrinsicInfo.h" |
27 | |
28 | using namespace llvm; |
29 | |
30 | #define DEBUG_TYPE "nvptx-isel" |
31 | #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection" |
32 | |
33 | static cl::opt<bool> |
34 | EnableRsqrtOpt("nvptx-rsqrt-approx-opt" , cl::init(Val: true), cl::Hidden, |
35 | cl::desc("Enable reciprocal sqrt optimization" )); |
36 | |
37 | /// createNVPTXISelDag - This pass converts a legalized DAG into a |
38 | /// NVPTX-specific DAG, ready for instruction scheduling. |
39 | FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, |
40 | llvm::CodeGenOptLevel OptLevel) { |
41 | return new NVPTXDAGToDAGISel(TM, OptLevel); |
42 | } |
43 | |
44 | char NVPTXDAGToDAGISel::ID = 0; |
45 | |
46 | INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false) |
47 | |
48 | NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, |
49 | CodeGenOptLevel OptLevel) |
50 | : SelectionDAGISel(ID, tm, OptLevel), TM(tm) { |
51 | doMulWide = (OptLevel > CodeGenOptLevel::None); |
52 | } |
53 | |
54 | bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { |
55 | Subtarget = &MF.getSubtarget<NVPTXSubtarget>(); |
56 | return SelectionDAGISel::runOnMachineFunction(MF); |
57 | } |
58 | |
59 | int NVPTXDAGToDAGISel::getDivF32Level() const { |
60 | return Subtarget->getTargetLowering()->getDivF32Level(); |
61 | } |
62 | |
63 | bool NVPTXDAGToDAGISel::usePrecSqrtF32() const { |
64 | return Subtarget->getTargetLowering()->usePrecSqrtF32(); |
65 | } |
66 | |
67 | bool NVPTXDAGToDAGISel::useF32FTZ() const { |
return Subtarget->getTargetLowering()->useF32FTZ(*MF);
69 | } |
70 | |
71 | bool NVPTXDAGToDAGISel::allowFMA() const { |
72 | const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); |
73 | return TL->allowFMA(MF&: *MF, OptLevel); |
74 | } |
75 | |
76 | bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { |
77 | const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); |
return TL->allowUnsafeFPMath(*MF);
79 | } |
80 | |
81 | bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; } |
82 | |
83 | /// Select - Select instructions not customized! Used for |
84 | /// expanded, promoted and normal instructions. |
85 | void NVPTXDAGToDAGISel::Select(SDNode *N) { |
86 | |
87 | if (N->isMachineOpcode()) { |
88 | N->setNodeId(-1); |
89 | return; // Already selected. |
90 | } |
91 | |
92 | switch (N->getOpcode()) { |
93 | case ISD::LOAD: |
94 | case ISD::ATOMIC_LOAD: |
95 | if (tryLoad(N)) |
96 | return; |
97 | break; |
98 | case ISD::STORE: |
99 | case ISD::ATOMIC_STORE: |
100 | if (tryStore(N)) |
101 | return; |
102 | break; |
103 | case ISD::EXTRACT_VECTOR_ELT: |
104 | if (tryEXTRACT_VECTOR_ELEMENT(N)) |
105 | return; |
106 | break; |
107 | case NVPTXISD::SETP_F16X2: |
108 | SelectSETP_F16X2(N); |
109 | return; |
110 | case NVPTXISD::SETP_BF16X2: |
111 | SelectSETP_BF16X2(N); |
112 | return; |
113 | case NVPTXISD::LoadV2: |
114 | case NVPTXISD::LoadV4: |
115 | if (tryLoadVector(N)) |
116 | return; |
117 | break; |
118 | case NVPTXISD::LDGV2: |
119 | case NVPTXISD::LDGV4: |
120 | case NVPTXISD::LDUV2: |
121 | case NVPTXISD::LDUV4: |
122 | if (tryLDGLDU(N)) |
123 | return; |
124 | break; |
125 | case NVPTXISD::StoreV2: |
126 | case NVPTXISD::StoreV4: |
127 | if (tryStoreVector(N)) |
128 | return; |
129 | break; |
130 | case NVPTXISD::LoadParam: |
131 | case NVPTXISD::LoadParamV2: |
132 | case NVPTXISD::LoadParamV4: |
133 | if (tryLoadParam(N)) |
134 | return; |
135 | break; |
136 | case NVPTXISD::StoreRetval: |
137 | case NVPTXISD::StoreRetvalV2: |
138 | case NVPTXISD::StoreRetvalV4: |
139 | if (tryStoreRetval(N)) |
140 | return; |
141 | break; |
142 | case NVPTXISD::StoreParam: |
143 | case NVPTXISD::StoreParamV2: |
144 | case NVPTXISD::StoreParamV4: |
145 | case NVPTXISD::StoreParamS32: |
146 | case NVPTXISD::StoreParamU32: |
147 | if (tryStoreParam(N)) |
148 | return; |
149 | break; |
150 | case ISD::INTRINSIC_WO_CHAIN: |
151 | if (tryIntrinsicNoChain(N)) |
152 | return; |
153 | break; |
154 | case ISD::INTRINSIC_W_CHAIN: |
155 | if (tryIntrinsicChain(N)) |
156 | return; |
157 | break; |
158 | case NVPTXISD::Tex1DFloatS32: |
159 | case NVPTXISD::Tex1DFloatFloat: |
160 | case NVPTXISD::Tex1DFloatFloatLevel: |
161 | case NVPTXISD::Tex1DFloatFloatGrad: |
162 | case NVPTXISD::Tex1DS32S32: |
163 | case NVPTXISD::Tex1DS32Float: |
164 | case NVPTXISD::Tex1DS32FloatLevel: |
165 | case NVPTXISD::Tex1DS32FloatGrad: |
166 | case NVPTXISD::Tex1DU32S32: |
167 | case NVPTXISD::Tex1DU32Float: |
168 | case NVPTXISD::Tex1DU32FloatLevel: |
169 | case NVPTXISD::Tex1DU32FloatGrad: |
170 | case NVPTXISD::Tex1DArrayFloatS32: |
171 | case NVPTXISD::Tex1DArrayFloatFloat: |
172 | case NVPTXISD::Tex1DArrayFloatFloatLevel: |
173 | case NVPTXISD::Tex1DArrayFloatFloatGrad: |
174 | case NVPTXISD::Tex1DArrayS32S32: |
175 | case NVPTXISD::Tex1DArrayS32Float: |
176 | case NVPTXISD::Tex1DArrayS32FloatLevel: |
177 | case NVPTXISD::Tex1DArrayS32FloatGrad: |
178 | case NVPTXISD::Tex1DArrayU32S32: |
179 | case NVPTXISD::Tex1DArrayU32Float: |
180 | case NVPTXISD::Tex1DArrayU32FloatLevel: |
181 | case NVPTXISD::Tex1DArrayU32FloatGrad: |
182 | case NVPTXISD::Tex2DFloatS32: |
183 | case NVPTXISD::Tex2DFloatFloat: |
184 | case NVPTXISD::Tex2DFloatFloatLevel: |
185 | case NVPTXISD::Tex2DFloatFloatGrad: |
186 | case NVPTXISD::Tex2DS32S32: |
187 | case NVPTXISD::Tex2DS32Float: |
188 | case NVPTXISD::Tex2DS32FloatLevel: |
189 | case NVPTXISD::Tex2DS32FloatGrad: |
190 | case NVPTXISD::Tex2DU32S32: |
191 | case NVPTXISD::Tex2DU32Float: |
192 | case NVPTXISD::Tex2DU32FloatLevel: |
193 | case NVPTXISD::Tex2DU32FloatGrad: |
194 | case NVPTXISD::Tex2DArrayFloatS32: |
195 | case NVPTXISD::Tex2DArrayFloatFloat: |
196 | case NVPTXISD::Tex2DArrayFloatFloatLevel: |
197 | case NVPTXISD::Tex2DArrayFloatFloatGrad: |
198 | case NVPTXISD::Tex2DArrayS32S32: |
199 | case NVPTXISD::Tex2DArrayS32Float: |
200 | case NVPTXISD::Tex2DArrayS32FloatLevel: |
201 | case NVPTXISD::Tex2DArrayS32FloatGrad: |
202 | case NVPTXISD::Tex2DArrayU32S32: |
203 | case NVPTXISD::Tex2DArrayU32Float: |
204 | case NVPTXISD::Tex2DArrayU32FloatLevel: |
205 | case NVPTXISD::Tex2DArrayU32FloatGrad: |
206 | case NVPTXISD::Tex3DFloatS32: |
207 | case NVPTXISD::Tex3DFloatFloat: |
208 | case NVPTXISD::Tex3DFloatFloatLevel: |
209 | case NVPTXISD::Tex3DFloatFloatGrad: |
210 | case NVPTXISD::Tex3DS32S32: |
211 | case NVPTXISD::Tex3DS32Float: |
212 | case NVPTXISD::Tex3DS32FloatLevel: |
213 | case NVPTXISD::Tex3DS32FloatGrad: |
214 | case NVPTXISD::Tex3DU32S32: |
215 | case NVPTXISD::Tex3DU32Float: |
216 | case NVPTXISD::Tex3DU32FloatLevel: |
217 | case NVPTXISD::Tex3DU32FloatGrad: |
218 | case NVPTXISD::TexCubeFloatFloat: |
219 | case NVPTXISD::TexCubeFloatFloatLevel: |
220 | case NVPTXISD::TexCubeS32Float: |
221 | case NVPTXISD::TexCubeS32FloatLevel: |
222 | case NVPTXISD::TexCubeU32Float: |
223 | case NVPTXISD::TexCubeU32FloatLevel: |
224 | case NVPTXISD::TexCubeArrayFloatFloat: |
225 | case NVPTXISD::TexCubeArrayFloatFloatLevel: |
226 | case NVPTXISD::TexCubeArrayS32Float: |
227 | case NVPTXISD::TexCubeArrayS32FloatLevel: |
228 | case NVPTXISD::TexCubeArrayU32Float: |
229 | case NVPTXISD::TexCubeArrayU32FloatLevel: |
230 | case NVPTXISD::Tld4R2DFloatFloat: |
231 | case NVPTXISD::Tld4G2DFloatFloat: |
232 | case NVPTXISD::Tld4B2DFloatFloat: |
233 | case NVPTXISD::Tld4A2DFloatFloat: |
234 | case NVPTXISD::Tld4R2DS64Float: |
235 | case NVPTXISD::Tld4G2DS64Float: |
236 | case NVPTXISD::Tld4B2DS64Float: |
237 | case NVPTXISD::Tld4A2DS64Float: |
238 | case NVPTXISD::Tld4R2DU64Float: |
239 | case NVPTXISD::Tld4G2DU64Float: |
240 | case NVPTXISD::Tld4B2DU64Float: |
241 | case NVPTXISD::Tld4A2DU64Float: |
242 | case NVPTXISD::TexUnified1DFloatS32: |
243 | case NVPTXISD::TexUnified1DFloatFloat: |
244 | case NVPTXISD::TexUnified1DFloatFloatLevel: |
245 | case NVPTXISD::TexUnified1DFloatFloatGrad: |
246 | case NVPTXISD::TexUnified1DS32S32: |
247 | case NVPTXISD::TexUnified1DS32Float: |
248 | case NVPTXISD::TexUnified1DS32FloatLevel: |
249 | case NVPTXISD::TexUnified1DS32FloatGrad: |
250 | case NVPTXISD::TexUnified1DU32S32: |
251 | case NVPTXISD::TexUnified1DU32Float: |
252 | case NVPTXISD::TexUnified1DU32FloatLevel: |
253 | case NVPTXISD::TexUnified1DU32FloatGrad: |
254 | case NVPTXISD::TexUnified1DArrayFloatS32: |
255 | case NVPTXISD::TexUnified1DArrayFloatFloat: |
256 | case NVPTXISD::TexUnified1DArrayFloatFloatLevel: |
257 | case NVPTXISD::TexUnified1DArrayFloatFloatGrad: |
258 | case NVPTXISD::TexUnified1DArrayS32S32: |
259 | case NVPTXISD::TexUnified1DArrayS32Float: |
260 | case NVPTXISD::TexUnified1DArrayS32FloatLevel: |
261 | case NVPTXISD::TexUnified1DArrayS32FloatGrad: |
262 | case NVPTXISD::TexUnified1DArrayU32S32: |
263 | case NVPTXISD::TexUnified1DArrayU32Float: |
264 | case NVPTXISD::TexUnified1DArrayU32FloatLevel: |
265 | case NVPTXISD::TexUnified1DArrayU32FloatGrad: |
266 | case NVPTXISD::TexUnified2DFloatS32: |
267 | case NVPTXISD::TexUnified2DFloatFloat: |
268 | case NVPTXISD::TexUnified2DFloatFloatLevel: |
269 | case NVPTXISD::TexUnified2DFloatFloatGrad: |
270 | case NVPTXISD::TexUnified2DS32S32: |
271 | case NVPTXISD::TexUnified2DS32Float: |
272 | case NVPTXISD::TexUnified2DS32FloatLevel: |
273 | case NVPTXISD::TexUnified2DS32FloatGrad: |
274 | case NVPTXISD::TexUnified2DU32S32: |
275 | case NVPTXISD::TexUnified2DU32Float: |
276 | case NVPTXISD::TexUnified2DU32FloatLevel: |
277 | case NVPTXISD::TexUnified2DU32FloatGrad: |
278 | case NVPTXISD::TexUnified2DArrayFloatS32: |
279 | case NVPTXISD::TexUnified2DArrayFloatFloat: |
280 | case NVPTXISD::TexUnified2DArrayFloatFloatLevel: |
281 | case NVPTXISD::TexUnified2DArrayFloatFloatGrad: |
282 | case NVPTXISD::TexUnified2DArrayS32S32: |
283 | case NVPTXISD::TexUnified2DArrayS32Float: |
284 | case NVPTXISD::TexUnified2DArrayS32FloatLevel: |
285 | case NVPTXISD::TexUnified2DArrayS32FloatGrad: |
286 | case NVPTXISD::TexUnified2DArrayU32S32: |
287 | case NVPTXISD::TexUnified2DArrayU32Float: |
288 | case NVPTXISD::TexUnified2DArrayU32FloatLevel: |
289 | case NVPTXISD::TexUnified2DArrayU32FloatGrad: |
290 | case NVPTXISD::TexUnified3DFloatS32: |
291 | case NVPTXISD::TexUnified3DFloatFloat: |
292 | case NVPTXISD::TexUnified3DFloatFloatLevel: |
293 | case NVPTXISD::TexUnified3DFloatFloatGrad: |
294 | case NVPTXISD::TexUnified3DS32S32: |
295 | case NVPTXISD::TexUnified3DS32Float: |
296 | case NVPTXISD::TexUnified3DS32FloatLevel: |
297 | case NVPTXISD::TexUnified3DS32FloatGrad: |
298 | case NVPTXISD::TexUnified3DU32S32: |
299 | case NVPTXISD::TexUnified3DU32Float: |
300 | case NVPTXISD::TexUnified3DU32FloatLevel: |
301 | case NVPTXISD::TexUnified3DU32FloatGrad: |
302 | case NVPTXISD::TexUnifiedCubeFloatFloat: |
303 | case NVPTXISD::TexUnifiedCubeFloatFloatLevel: |
304 | case NVPTXISD::TexUnifiedCubeS32Float: |
305 | case NVPTXISD::TexUnifiedCubeS32FloatLevel: |
306 | case NVPTXISD::TexUnifiedCubeU32Float: |
307 | case NVPTXISD::TexUnifiedCubeU32FloatLevel: |
308 | case NVPTXISD::TexUnifiedCubeArrayFloatFloat: |
309 | case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: |
310 | case NVPTXISD::TexUnifiedCubeArrayS32Float: |
311 | case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: |
312 | case NVPTXISD::TexUnifiedCubeArrayU32Float: |
313 | case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: |
314 | case NVPTXISD::TexUnifiedCubeFloatFloatGrad: |
315 | case NVPTXISD::TexUnifiedCubeS32FloatGrad: |
316 | case NVPTXISD::TexUnifiedCubeU32FloatGrad: |
317 | case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: |
318 | case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: |
319 | case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: |
320 | case NVPTXISD::Tld4UnifiedR2DFloatFloat: |
321 | case NVPTXISD::Tld4UnifiedG2DFloatFloat: |
322 | case NVPTXISD::Tld4UnifiedB2DFloatFloat: |
323 | case NVPTXISD::Tld4UnifiedA2DFloatFloat: |
324 | case NVPTXISD::Tld4UnifiedR2DS64Float: |
325 | case NVPTXISD::Tld4UnifiedG2DS64Float: |
326 | case NVPTXISD::Tld4UnifiedB2DS64Float: |
327 | case NVPTXISD::Tld4UnifiedA2DS64Float: |
328 | case NVPTXISD::Tld4UnifiedR2DU64Float: |
329 | case NVPTXISD::Tld4UnifiedG2DU64Float: |
330 | case NVPTXISD::Tld4UnifiedB2DU64Float: |
331 | case NVPTXISD::Tld4UnifiedA2DU64Float: |
332 | if (tryTextureIntrinsic(N)) |
333 | return; |
334 | break; |
335 | case NVPTXISD::Suld1DI8Clamp: |
336 | case NVPTXISD::Suld1DI16Clamp: |
337 | case NVPTXISD::Suld1DI32Clamp: |
338 | case NVPTXISD::Suld1DI64Clamp: |
339 | case NVPTXISD::Suld1DV2I8Clamp: |
340 | case NVPTXISD::Suld1DV2I16Clamp: |
341 | case NVPTXISD::Suld1DV2I32Clamp: |
342 | case NVPTXISD::Suld1DV2I64Clamp: |
343 | case NVPTXISD::Suld1DV4I8Clamp: |
344 | case NVPTXISD::Suld1DV4I16Clamp: |
345 | case NVPTXISD::Suld1DV4I32Clamp: |
346 | case NVPTXISD::Suld1DArrayI8Clamp: |
347 | case NVPTXISD::Suld1DArrayI16Clamp: |
348 | case NVPTXISD::Suld1DArrayI32Clamp: |
349 | case NVPTXISD::Suld1DArrayI64Clamp: |
350 | case NVPTXISD::Suld1DArrayV2I8Clamp: |
351 | case NVPTXISD::Suld1DArrayV2I16Clamp: |
352 | case NVPTXISD::Suld1DArrayV2I32Clamp: |
353 | case NVPTXISD::Suld1DArrayV2I64Clamp: |
354 | case NVPTXISD::Suld1DArrayV4I8Clamp: |
355 | case NVPTXISD::Suld1DArrayV4I16Clamp: |
356 | case NVPTXISD::Suld1DArrayV4I32Clamp: |
357 | case NVPTXISD::Suld2DI8Clamp: |
358 | case NVPTXISD::Suld2DI16Clamp: |
359 | case NVPTXISD::Suld2DI32Clamp: |
360 | case NVPTXISD::Suld2DI64Clamp: |
361 | case NVPTXISD::Suld2DV2I8Clamp: |
362 | case NVPTXISD::Suld2DV2I16Clamp: |
363 | case NVPTXISD::Suld2DV2I32Clamp: |
364 | case NVPTXISD::Suld2DV2I64Clamp: |
365 | case NVPTXISD::Suld2DV4I8Clamp: |
366 | case NVPTXISD::Suld2DV4I16Clamp: |
367 | case NVPTXISD::Suld2DV4I32Clamp: |
368 | case NVPTXISD::Suld2DArrayI8Clamp: |
369 | case NVPTXISD::Suld2DArrayI16Clamp: |
370 | case NVPTXISD::Suld2DArrayI32Clamp: |
371 | case NVPTXISD::Suld2DArrayI64Clamp: |
372 | case NVPTXISD::Suld2DArrayV2I8Clamp: |
373 | case NVPTXISD::Suld2DArrayV2I16Clamp: |
374 | case NVPTXISD::Suld2DArrayV2I32Clamp: |
375 | case NVPTXISD::Suld2DArrayV2I64Clamp: |
376 | case NVPTXISD::Suld2DArrayV4I8Clamp: |
377 | case NVPTXISD::Suld2DArrayV4I16Clamp: |
378 | case NVPTXISD::Suld2DArrayV4I32Clamp: |
379 | case NVPTXISD::Suld3DI8Clamp: |
380 | case NVPTXISD::Suld3DI16Clamp: |
381 | case NVPTXISD::Suld3DI32Clamp: |
382 | case NVPTXISD::Suld3DI64Clamp: |
383 | case NVPTXISD::Suld3DV2I8Clamp: |
384 | case NVPTXISD::Suld3DV2I16Clamp: |
385 | case NVPTXISD::Suld3DV2I32Clamp: |
386 | case NVPTXISD::Suld3DV2I64Clamp: |
387 | case NVPTXISD::Suld3DV4I8Clamp: |
388 | case NVPTXISD::Suld3DV4I16Clamp: |
389 | case NVPTXISD::Suld3DV4I32Clamp: |
390 | case NVPTXISD::Suld1DI8Trap: |
391 | case NVPTXISD::Suld1DI16Trap: |
392 | case NVPTXISD::Suld1DI32Trap: |
393 | case NVPTXISD::Suld1DI64Trap: |
394 | case NVPTXISD::Suld1DV2I8Trap: |
395 | case NVPTXISD::Suld1DV2I16Trap: |
396 | case NVPTXISD::Suld1DV2I32Trap: |
397 | case NVPTXISD::Suld1DV2I64Trap: |
398 | case NVPTXISD::Suld1DV4I8Trap: |
399 | case NVPTXISD::Suld1DV4I16Trap: |
400 | case NVPTXISD::Suld1DV4I32Trap: |
401 | case NVPTXISD::Suld1DArrayI8Trap: |
402 | case NVPTXISD::Suld1DArrayI16Trap: |
403 | case NVPTXISD::Suld1DArrayI32Trap: |
404 | case NVPTXISD::Suld1DArrayI64Trap: |
405 | case NVPTXISD::Suld1DArrayV2I8Trap: |
406 | case NVPTXISD::Suld1DArrayV2I16Trap: |
407 | case NVPTXISD::Suld1DArrayV2I32Trap: |
408 | case NVPTXISD::Suld1DArrayV2I64Trap: |
409 | case NVPTXISD::Suld1DArrayV4I8Trap: |
410 | case NVPTXISD::Suld1DArrayV4I16Trap: |
411 | case NVPTXISD::Suld1DArrayV4I32Trap: |
412 | case NVPTXISD::Suld2DI8Trap: |
413 | case NVPTXISD::Suld2DI16Trap: |
414 | case NVPTXISD::Suld2DI32Trap: |
415 | case NVPTXISD::Suld2DI64Trap: |
416 | case NVPTXISD::Suld2DV2I8Trap: |
417 | case NVPTXISD::Suld2DV2I16Trap: |
418 | case NVPTXISD::Suld2DV2I32Trap: |
419 | case NVPTXISD::Suld2DV2I64Trap: |
420 | case NVPTXISD::Suld2DV4I8Trap: |
421 | case NVPTXISD::Suld2DV4I16Trap: |
422 | case NVPTXISD::Suld2DV4I32Trap: |
423 | case NVPTXISD::Suld2DArrayI8Trap: |
424 | case NVPTXISD::Suld2DArrayI16Trap: |
425 | case NVPTXISD::Suld2DArrayI32Trap: |
426 | case NVPTXISD::Suld2DArrayI64Trap: |
427 | case NVPTXISD::Suld2DArrayV2I8Trap: |
428 | case NVPTXISD::Suld2DArrayV2I16Trap: |
429 | case NVPTXISD::Suld2DArrayV2I32Trap: |
430 | case NVPTXISD::Suld2DArrayV2I64Trap: |
431 | case NVPTXISD::Suld2DArrayV4I8Trap: |
432 | case NVPTXISD::Suld2DArrayV4I16Trap: |
433 | case NVPTXISD::Suld2DArrayV4I32Trap: |
434 | case NVPTXISD::Suld3DI8Trap: |
435 | case NVPTXISD::Suld3DI16Trap: |
436 | case NVPTXISD::Suld3DI32Trap: |
437 | case NVPTXISD::Suld3DI64Trap: |
438 | case NVPTXISD::Suld3DV2I8Trap: |
439 | case NVPTXISD::Suld3DV2I16Trap: |
440 | case NVPTXISD::Suld3DV2I32Trap: |
441 | case NVPTXISD::Suld3DV2I64Trap: |
442 | case NVPTXISD::Suld3DV4I8Trap: |
443 | case NVPTXISD::Suld3DV4I16Trap: |
444 | case NVPTXISD::Suld3DV4I32Trap: |
445 | case NVPTXISD::Suld1DI8Zero: |
446 | case NVPTXISD::Suld1DI16Zero: |
447 | case NVPTXISD::Suld1DI32Zero: |
448 | case NVPTXISD::Suld1DI64Zero: |
449 | case NVPTXISD::Suld1DV2I8Zero: |
450 | case NVPTXISD::Suld1DV2I16Zero: |
451 | case NVPTXISD::Suld1DV2I32Zero: |
452 | case NVPTXISD::Suld1DV2I64Zero: |
453 | case NVPTXISD::Suld1DV4I8Zero: |
454 | case NVPTXISD::Suld1DV4I16Zero: |
455 | case NVPTXISD::Suld1DV4I32Zero: |
456 | case NVPTXISD::Suld1DArrayI8Zero: |
457 | case NVPTXISD::Suld1DArrayI16Zero: |
458 | case NVPTXISD::Suld1DArrayI32Zero: |
459 | case NVPTXISD::Suld1DArrayI64Zero: |
460 | case NVPTXISD::Suld1DArrayV2I8Zero: |
461 | case NVPTXISD::Suld1DArrayV2I16Zero: |
462 | case NVPTXISD::Suld1DArrayV2I32Zero: |
463 | case NVPTXISD::Suld1DArrayV2I64Zero: |
464 | case NVPTXISD::Suld1DArrayV4I8Zero: |
465 | case NVPTXISD::Suld1DArrayV4I16Zero: |
466 | case NVPTXISD::Suld1DArrayV4I32Zero: |
467 | case NVPTXISD::Suld2DI8Zero: |
468 | case NVPTXISD::Suld2DI16Zero: |
469 | case NVPTXISD::Suld2DI32Zero: |
470 | case NVPTXISD::Suld2DI64Zero: |
471 | case NVPTXISD::Suld2DV2I8Zero: |
472 | case NVPTXISD::Suld2DV2I16Zero: |
473 | case NVPTXISD::Suld2DV2I32Zero: |
474 | case NVPTXISD::Suld2DV2I64Zero: |
475 | case NVPTXISD::Suld2DV4I8Zero: |
476 | case NVPTXISD::Suld2DV4I16Zero: |
477 | case NVPTXISD::Suld2DV4I32Zero: |
478 | case NVPTXISD::Suld2DArrayI8Zero: |
479 | case NVPTXISD::Suld2DArrayI16Zero: |
480 | case NVPTXISD::Suld2DArrayI32Zero: |
481 | case NVPTXISD::Suld2DArrayI64Zero: |
482 | case NVPTXISD::Suld2DArrayV2I8Zero: |
483 | case NVPTXISD::Suld2DArrayV2I16Zero: |
484 | case NVPTXISD::Suld2DArrayV2I32Zero: |
485 | case NVPTXISD::Suld2DArrayV2I64Zero: |
486 | case NVPTXISD::Suld2DArrayV4I8Zero: |
487 | case NVPTXISD::Suld2DArrayV4I16Zero: |
488 | case NVPTXISD::Suld2DArrayV4I32Zero: |
489 | case NVPTXISD::Suld3DI8Zero: |
490 | case NVPTXISD::Suld3DI16Zero: |
491 | case NVPTXISD::Suld3DI32Zero: |
492 | case NVPTXISD::Suld3DI64Zero: |
493 | case NVPTXISD::Suld3DV2I8Zero: |
494 | case NVPTXISD::Suld3DV2I16Zero: |
495 | case NVPTXISD::Suld3DV2I32Zero: |
496 | case NVPTXISD::Suld3DV2I64Zero: |
497 | case NVPTXISD::Suld3DV4I8Zero: |
498 | case NVPTXISD::Suld3DV4I16Zero: |
499 | case NVPTXISD::Suld3DV4I32Zero: |
500 | if (trySurfaceIntrinsic(N)) |
501 | return; |
502 | break; |
503 | case ISD::AND: |
504 | case ISD::SRA: |
505 | case ISD::SRL: |
506 | // Try to select BFE |
507 | if (tryBFE(N)) |
508 | return; |
509 | break; |
510 | case ISD::ADDRSPACECAST: |
511 | SelectAddrSpaceCast(N); |
512 | return; |
513 | case ISD::ConstantFP: |
514 | if (tryConstantFP(N)) |
515 | return; |
516 | break; |
517 | default: |
518 | break; |
519 | } |
520 | SelectCode(N); |
521 | } |
522 | |
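// Chained intrinsics selected by hand; everything else falls through to the
// tablegen-generated matcher. Currently this is just the ldg/ldu loads.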
523 | bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { |
unsigned IID = N->getConstantOperandVal(1);
525 | switch (IID) { |
526 | default: |
527 | return false; |
528 | case Intrinsic::nvvm_ldg_global_f: |
529 | case Intrinsic::nvvm_ldg_global_i: |
530 | case Intrinsic::nvvm_ldg_global_p: |
531 | case Intrinsic::nvvm_ldu_global_f: |
532 | case Intrinsic::nvvm_ldu_global_i: |
533 | case Intrinsic::nvvm_ldu_global_p: |
534 | return tryLDGLDU(N); |
535 | } |
536 | } |
537 | |
538 | // There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we |
539 | // have to load them into an .(b)f16 register first. |
540 | bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) { |
if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
return false;
SDValue Val = CurDAG->getTargetConstantFP(
cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
SDNode *LoadConstF16 = CurDAG->getMachineNode(
(N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
: NVPTX::LOAD_CONST_BF16),
SDLoc(N), N->getValueType(0), Val);
ReplaceNode(N, LoadConstF16);
550 | return true; |
551 | } |
552 | |
553 | // Map ISD:CONDCODE value to appropriate CmpMode expected by |
554 | // NVPTXInstPrinter::printCmpMode() |
555 | static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) { |
556 | using NVPTX::PTXCmpMode::CmpMode; |
557 | unsigned PTXCmpMode = [](ISD::CondCode CC) { |
558 | switch (CC) { |
559 | default: |
560 | llvm_unreachable("Unexpected condition code." ); |
561 | case ISD::SETOEQ: |
562 | return CmpMode::EQ; |
563 | case ISD::SETOGT: |
564 | return CmpMode::GT; |
565 | case ISD::SETOGE: |
566 | return CmpMode::GE; |
567 | case ISD::SETOLT: |
568 | return CmpMode::LT; |
569 | case ISD::SETOLE: |
570 | return CmpMode::LE; |
571 | case ISD::SETONE: |
572 | return CmpMode::NE; |
573 | case ISD::SETO: |
574 | return CmpMode::NUM; |
575 | case ISD::SETUO: |
576 | return CmpMode::NotANumber; |
577 | case ISD::SETUEQ: |
578 | return CmpMode::EQU; |
579 | case ISD::SETUGT: |
580 | return CmpMode::GTU; |
581 | case ISD::SETUGE: |
582 | return CmpMode::GEU; |
583 | case ISD::SETULT: |
584 | return CmpMode::LTU; |
585 | case ISD::SETULE: |
586 | return CmpMode::LEU; |
587 | case ISD::SETUNE: |
588 | return CmpMode::NEU; |
589 | case ISD::SETEQ: |
590 | return CmpMode::EQ; |
591 | case ISD::SETGT: |
592 | return CmpMode::GT; |
593 | case ISD::SETGE: |
594 | return CmpMode::GE; |
595 | case ISD::SETLT: |
596 | return CmpMode::LT; |
597 | case ISD::SETLE: |
598 | return CmpMode::LE; |
599 | case ISD::SETNE: |
600 | return CmpMode::NE; |
601 | } |
602 | }(CondCode.get()); |
603 | |
604 | if (FTZ) |
605 | PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG; |
606 | |
607 | return PTXCmpMode; |
608 | } |
609 | |
610 | bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) { |
611 | unsigned PTXCmpMode = |
getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
613 | SDLoc DL(N); |
614 | SDNode *SetP = CurDAG->getMachineNode( |
NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
ReplaceNode(N, SetP);
618 | return true; |
619 | } |
620 | |
621 | bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) { |
622 | unsigned PTXCmpMode = |
getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
624 | SDLoc DL(N); |
625 | SDNode *SetP = CurDAG->getMachineNode( |
NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
ReplaceNode(N, SetP);
629 | return true; |
630 | } |
631 | |
632 | // Find all instances of extract_vector_elt that use this v2f16 vector |
633 | // and coalesce them into a scattering move instruction. |
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
SDValue Vector = N->getOperand(0);
636 | |
637 | // We only care about 16x2 as it's the only real vector type we |
638 | // need to deal with. |
639 | MVT VT = Vector.getSimpleValueType(); |
640 | if (!Isv2x16VT(VT)) |
641 | return false; |
642 | // Find and record all uses of this vector that extract element 0 or 1. |
643 | SmallVector<SDNode *, 4> E0, E1; |
644 | for (auto *U : Vector.getNode()->uses()) { |
645 | if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
646 | continue; |
if (U->getOperand(0) != Vector)
648 | continue; |
649 | if (const ConstantSDNode *IdxConst = |
dyn_cast<ConstantSDNode>(U->getOperand(1))) {
if (IdxConst->getZExtValue() == 0)
E0.push_back(U);
else if (IdxConst->getZExtValue() == 1)
E1.push_back(U);
else
llvm_unreachable("Invalid vector index.");
657 | } |
658 | } |
659 | |
660 | // There's no point scattering f16x2 if we only ever access one |
661 | // element of it. |
662 | if (E0.empty() || E1.empty()) |
663 | return false; |
664 | |
665 | // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) |
666 | // into f16,f16 SplitF16x2(V) |
667 | MVT EltVT = VT.getVectorElementType(); |
668 | SDNode *ScatterOp = |
669 | CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector); |
670 | for (auto *Node : E0) |
ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
for (auto *Node : E1)
ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
674 | |
675 | return true; |
676 | } |
677 | |
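// Map the address space of a memory operation's source pointer to the PTX
// ld/st instruction code. Falls back to GENERIC when no IR value is attached
// to the memory operand.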
678 | static unsigned int getCodeAddrSpace(MemSDNode *N) { |
679 | const Value *Src = N->getMemOperand()->getValue(); |
680 | |
681 | if (!Src) |
682 | return NVPTX::PTXLdStInstCode::GENERIC; |
683 | |
if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
685 | switch (PT->getAddressSpace()) { |
686 | case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; |
687 | case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; |
688 | case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; |
689 | case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; |
690 | case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; |
691 | case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT; |
692 | default: break; |
693 | } |
694 | } |
695 | return NVPTX::PTXLdStInstCode::GENERIC; |
696 | } |
697 | |
698 | static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, |
699 | unsigned CodeAddrSpace, MachineFunction *F) { |
700 | // We use ldg (i.e. ld.global.nc) for invariant loads from the global address |
701 | // space. |
702 | // |
703 | // We have two ways of identifying invariant loads: Loads may be explicitly |
704 | // marked as invariant, or we may infer them to be invariant. |
705 | // |
706 | // We currently infer invariance for loads from |
707 | // - constant global variables, and |
708 | // - kernel function pointer params that are noalias (i.e. __restrict) and |
709 | // never written to. |
710 | // |
711 | // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally |
712 | // not during the SelectionDAG phase). |
713 | // |
714 | // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for |
715 | // explicitly invariant loads because these are how clang tells us to use ldg |
716 | // when the user uses a builtin. |
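//
// For example (illustrative only), a kernel pointer parameter declared
// __restrict__ whose memory the kernel only reads satisfies both conditions
// above and is selected as ld.global.nc.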
717 | if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) |
718 | return false; |
719 | |
720 | if (N->isInvariant()) |
721 | return true; |
722 | |
723 | bool IsKernelFn = isKernelFunction(F->getFunction()); |
724 | |
725 | // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly |
726 | // because the former looks through phi nodes while the latter does not. We |
727 | // need to look through phi nodes to handle pointer induction variables. |
728 | SmallVector<const Value *, 8> Objs; |
getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

return all_of(Objs, [&](const Value *V) {
if (auto *A = dyn_cast<const Argument>(V))
return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
if (auto *GV = dyn_cast<const GlobalVariable>(V))
735 | return GV->isConstant(); |
736 | return false; |
737 | }); |
738 | } |
739 | |
740 | bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { |
unsigned IID = N->getConstantOperandVal(0);
742 | switch (IID) { |
743 | default: |
744 | return false; |
745 | case Intrinsic::nvvm_texsurf_handle_internal: |
746 | SelectTexSurfHandle(N); |
747 | return true; |
748 | } |
749 | } |
750 | |
751 | void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { |
752 | // Op 0 is the intrinsic ID |
SDValue Wrapper = N->getOperand(1);
SDValue GlobalVal = Wrapper.getOperand(0);
ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
756 | MVT::i64, GlobalVal)); |
757 | } |
758 | |
759 | void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { |
SDValue Src = N->getOperand(0);
AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
762 | unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); |
763 | unsigned DstAddrSpace = CastN->getDestAddressSpace(); |
764 | assert(SrcAddrSpace != DstAddrSpace && |
765 | "addrspacecast must be between different address spaces" ); |
766 | |
767 | if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { |
768 | // Specific to generic |
769 | unsigned Opc; |
770 | switch (SrcAddrSpace) { |
default: report_fatal_error("Bad address space in addrspacecast");
772 | case ADDRESS_SPACE_GLOBAL: |
773 | Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global; |
774 | break; |
775 | case ADDRESS_SPACE_SHARED: |
776 | Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 |
777 | ? NVPTX::cvta_shared_6432 |
778 | : NVPTX::cvta_shared_64) |
779 | : NVPTX::cvta_shared; |
780 | break; |
781 | case ADDRESS_SPACE_CONST: |
782 | Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 |
783 | ? NVPTX::cvta_const_6432 |
784 | : NVPTX::cvta_const_64) |
785 | : NVPTX::cvta_const; |
786 | break; |
787 | case ADDRESS_SPACE_LOCAL: |
788 | Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 |
789 | ? NVPTX::cvta_local_6432 |
790 | : NVPTX::cvta_local_64) |
791 | : NVPTX::cvta_local; |
792 | break; |
793 | } |
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
Src));
796 | return; |
797 | } else { |
798 | // Generic to specific |
799 | if (SrcAddrSpace != 0) |
report_fatal_error("Cannot cast between two non-generic address spaces");
801 | unsigned Opc; |
802 | switch (DstAddrSpace) { |
default: report_fatal_error("Bad address space in addrspacecast");
804 | case ADDRESS_SPACE_GLOBAL: |
805 | Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global; |
806 | break; |
807 | case ADDRESS_SPACE_SHARED: |
808 | Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 |
809 | ? NVPTX::cvta_to_shared_3264 |
810 | : NVPTX::cvta_to_shared_64) |
811 | : NVPTX::cvta_to_shared; |
812 | break; |
813 | case ADDRESS_SPACE_CONST: |
814 | Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 |
815 | ? NVPTX::cvta_to_const_3264 |
816 | : NVPTX::cvta_to_const_64) |
817 | : NVPTX::cvta_to_const; |
818 | break; |
819 | case ADDRESS_SPACE_LOCAL: |
820 | Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 |
821 | ? NVPTX::cvta_to_local_3264 |
822 | : NVPTX::cvta_to_local_64) |
823 | : NVPTX::cvta_to_local; |
824 | break; |
825 | case ADDRESS_SPACE_PARAM: |
826 | Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 |
827 | : NVPTX::nvvm_ptr_gen_to_param; |
828 | break; |
829 | } |
ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
Src));
832 | return; |
833 | } |
834 | } |
835 | |
836 | // Helper function template to reduce amount of boilerplate code for |
837 | // opcode selection. |
838 | static std::optional<unsigned> |
839 | pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, |
840 | unsigned Opcode_i16, unsigned Opcode_i32, |
841 | std::optional<unsigned> Opcode_i64, unsigned Opcode_f32, |
842 | std::optional<unsigned> Opcode_f64) { |
843 | switch (VT) { |
844 | case MVT::i1: |
845 | case MVT::i8: |
846 | return Opcode_i8; |
847 | case MVT::i16: |
848 | return Opcode_i16; |
849 | case MVT::i32: |
850 | return Opcode_i32; |
851 | case MVT::i64: |
852 | return Opcode_i64; |
853 | case MVT::f16: |
854 | case MVT::bf16: |
855 | return Opcode_i16; |
856 | case MVT::v2f16: |
857 | case MVT::v2bf16: |
858 | case MVT::v2i16: |
859 | case MVT::v4i8: |
860 | return Opcode_i32; |
861 | case MVT::f32: |
862 | return Opcode_f32; |
863 | case MVT::f64: |
864 | return Opcode_f64; |
865 | default: |
866 | return std::nullopt; |
867 | } |
868 | } |
869 | |
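// Classify a loaded/stored value for the PTX ld/st type suffix: scalar and
// packed f16/bf16 use the untyped .b<N> forms, other FP types use .f<N>, and
// everything else is treated as unsigned integer.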
870 | static int getLdStRegType(EVT VT) { |
871 | if (VT.isFloatingPoint()) |
872 | switch (VT.getSimpleVT().SimpleTy) { |
873 | case MVT::f16: |
874 | case MVT::bf16: |
875 | case MVT::v2f16: |
876 | case MVT::v2bf16: |
877 | return NVPTX::PTXLdStInstCode::Untyped; |
878 | default: |
879 | return NVPTX::PTXLdStInstCode::Float; |
880 | } |
881 | else |
882 | return NVPTX::PTXLdStInstCode::Unsigned; |
883 | } |
884 | |
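// Lower a plain load (or an atomic load with at most monotonic ordering) to
// an NVPTX LD instruction, encoding the volatile flag, address space, vector
// arity, value type, and width as immediate operands.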
885 | bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { |
886 | SDLoc dl(N); |
MemSDNode *LD = cast<MemSDNode>(N);
assert(LD->readMem() && "Expected load");
LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
890 | EVT LoadedVT = LD->getMemoryVT(); |
891 | SDNode *NVPTXLD = nullptr; |
892 | |
893 | // do not support pre/post inc/dec |
894 | if (PlainLoad && PlainLoad->isIndexed()) |
895 | return false; |
896 | |
897 | if (!LoadedVT.isSimple()) |
898 | return false; |
899 | |
900 | AtomicOrdering Ordering = LD->getSuccessOrdering(); |
901 | // In order to lower atomic loads with stronger guarantees we would need to |
902 | // use load.acquire or insert fences. However these features were only added |
903 | // with PTX ISA 6.0 / sm_70. |
904 | // TODO: Check if we can actually use the new instructions and implement them. |
if (isStrongerThanMonotonic(Ordering))
906 | return false; |
907 | |
908 | // Address Space Setting |
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
911 | return tryLDGLDU(N); |
912 | } |
913 | |
914 | unsigned int PointerSize = |
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
916 | |
917 | // Volatile Setting |
918 | // - .volatile is only available for .global and .shared |
919 | // - .volatile has the same memory synchronization semantics as .relaxed.sys |
920 | bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic; |
921 | if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && |
922 | CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && |
923 | CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) |
924 | isVolatile = false; |
925 | |
926 | // Type Setting: fromType + fromTypeWidth |
927 | // |
928 | // Sign : ISD::SEXTLOAD |
929 | // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the |
930 | // type is integer |
931 | // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float |
932 | MVT SimpleVT = LoadedVT.getSimpleVT(); |
933 | MVT ScalarVT = SimpleVT.getScalarType(); |
934 | // Read at least 8 bits (predicates are stored as 8-bit values) |
unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
936 | unsigned int fromType; |
937 | |
938 | // Vector Setting |
939 | unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; |
940 | if (SimpleVT.isVector()) { |
941 | assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && |
942 | "Unexpected vector type" ); |
943 | // v2f16/v2bf16/v2i16 is loaded using ld.b32 |
944 | fromTypeWidth = 32; |
945 | } |
946 | |
947 | if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) |
948 | fromType = NVPTX::PTXLdStInstCode::Signed; |
949 | else |
fromType = getLdStRegType(ScalarVT);
951 | |
952 | // Create the machine instruction DAG |
SDValue Chain = N->getOperand(0);
SDValue N1 = N->getOperand(1);
955 | SDValue Addr; |
956 | SDValue Offset, Base; |
957 | std::optional<unsigned> Opcode; |
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
959 | |
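// Try the addressing modes from most to least specific: direct address
// (avar), symbol+immediate (asi), register+immediate (ari), and finally a
// plain register (areg).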
if (SelectDirectAddr(N1, Addr)) {
961 | Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, |
962 | NVPTX::LD_i32_avar, NVPTX::LD_i64_avar, |
963 | NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); |
964 | if (!Opcode) |
965 | return false; |
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Addr, Chain };
969 | NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); |
} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
972 | Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, |
973 | NVPTX::LD_i32_asi, NVPTX::LD_i64_asi, |
974 | NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); |
975 | if (!Opcode) |
976 | return false; |
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
980 | NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); |
} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
983 | if (PointerSize == 64) |
984 | Opcode = |
985 | pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64, |
986 | NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, |
987 | NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64); |
988 | else |
989 | Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, |
990 | NVPTX::LD_i32_ari, NVPTX::LD_i64_ari, |
991 | NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); |
992 | if (!Opcode) |
993 | return false; |
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
997 | NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); |
998 | } else { |
999 | if (PointerSize == 64) |
1000 | Opcode = |
1001 | pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64, |
1002 | NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, |
1003 | NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64); |
1004 | else |
1005 | Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, |
1006 | NVPTX::LD_i32_areg, NVPTX::LD_i64_areg, |
1007 | NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); |
1008 | if (!Opcode) |
1009 | return false; |
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), N1, Chain };
1013 | NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); |
1014 | } |
1015 | |
1016 | if (!NVPTXLD) |
1017 | return false; |
1018 | |
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1021 | |
ReplaceNode(N, NVPTXLD);
1023 | return true; |
1024 | } |
1025 | |
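// Lower NVPTXISD::LoadV2/LoadV4 vector loads, which use the same qualifier
// encoding as scalar loads plus a V2/V4 vector code.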
1026 | bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { |
1027 | |
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
1030 | SDValue Addr, Offset, Base; |
1031 | std::optional<unsigned> Opcode; |
1032 | SDLoc DL(N); |
1033 | SDNode *LD; |
MemSDNode *MemSD = cast<MemSDNode>(N);
1035 | EVT LoadedVT = MemSD->getMemoryVT(); |
1036 | |
1037 | if (!LoadedVT.isSimple()) |
1038 | return false; |
1039 | |
1040 | // Address Space Setting |
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1043 | return tryLDGLDU(N); |
1044 | } |
1045 | |
1046 | unsigned int PointerSize = |
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1048 | |
1049 | // Volatile Setting |
// - .volatile is only available for .global and .shared
1051 | bool IsVolatile = MemSD->isVolatile(); |
1052 | if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && |
1053 | CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && |
1054 | CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) |
1055 | IsVolatile = false; |
1056 | |
1057 | // Vector Setting |
1058 | MVT SimpleVT = LoadedVT.getSimpleVT(); |
1059 | |
1060 | // Type Setting: fromType + fromTypeWidth |
1061 | // |
1062 | // Sign : ISD::SEXTLOAD |
1063 | // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the |
1064 | // type is integer |
1065 | // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float |
1066 | MVT ScalarVT = SimpleVT.getScalarType(); |
1067 | // Read at least 8 bits (predicates are stored as 8-bit values) |
unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1069 | unsigned int FromType; |
1070 | // The last operand holds the original LoadSDNode::getExtensionType() value |
unsigned ExtensionType = cast<ConstantSDNode>(
N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1073 | if (ExtensionType == ISD::SEXTLOAD) |
1074 | FromType = NVPTX::PTXLdStInstCode::Signed; |
1075 | else |
FromType = getLdStRegType(ScalarVT);
1077 | |
1078 | unsigned VecType; |
1079 | |
1080 | switch (N->getOpcode()) { |
1081 | case NVPTXISD::LoadV2: |
1082 | VecType = NVPTX::PTXLdStInstCode::V2; |
1083 | break; |
1084 | case NVPTXISD::LoadV4: |
1085 | VecType = NVPTX::PTXLdStInstCode::V4; |
1086 | break; |
1087 | default: |
1088 | return false; |
1089 | } |
1090 | |
EVT EltVT = N->getValueType(0);
1092 | |
1093 | // v8x16 is a special case. PTX doesn't have ld.v8.16 |
1094 | // instruction. Instead, we split the vector into v2x16 chunks and |
1095 | // load them with ld.v4.b32. |
if (Isv2x16VT(EltVT)) {
assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1098 | EltVT = MVT::i32; |
1099 | FromType = NVPTX::PTXLdStInstCode::Untyped; |
1100 | FromTypeWidth = 32; |
1101 | } |
1102 | |
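// As in tryLoad(), try the addressing modes from most to least specific.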
if (SelectDirectAddr(Op1, Addr)) {
1104 | switch (N->getOpcode()) { |
1105 | default: |
1106 | return false; |
1107 | case NVPTXISD::LoadV2: |
1108 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1109 | NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar, |
1110 | NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar, |
1111 | NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar); |
1112 | break; |
1113 | case NVPTXISD::LoadV4: |
1114 | Opcode = |
1115 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar, |
1116 | NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, |
1117 | std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt); |
1118 | break; |
1119 | } |
1120 | if (!Opcode) |
1121 | return false; |
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Addr, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1126 | } else if (PointerSize == 64 |
? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1129 | switch (N->getOpcode()) { |
1130 | default: |
1131 | return false; |
1132 | case NVPTXISD::LoadV2: |
1133 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1134 | NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi, |
1135 | NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi, |
1136 | NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi); |
1137 | break; |
1138 | case NVPTXISD::LoadV4: |
1139 | Opcode = |
1140 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi, |
1141 | NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, |
1142 | std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt); |
1143 | break; |
1144 | } |
1145 | if (!Opcode) |
1146 | return false; |
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1151 | } else if (PointerSize == 64 |
? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1154 | if (PointerSize == 64) { |
1155 | switch (N->getOpcode()) { |
1156 | default: |
1157 | return false; |
1158 | case NVPTXISD::LoadV2: |
1159 | Opcode = |
1160 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1161 | NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64, |
1162 | NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64, |
1163 | NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64); |
1164 | break; |
1165 | case NVPTXISD::LoadV4: |
1166 | Opcode = pickOpcodeForVT( |
1167 | EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64, |
1168 | NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt, |
1169 | NVPTX::LDV_f32_v4_ari_64, std::nullopt); |
1170 | break; |
1171 | } |
1172 | } else { |
1173 | switch (N->getOpcode()) { |
1174 | default: |
1175 | return false; |
1176 | case NVPTXISD::LoadV2: |
1177 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1178 | NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari, |
1179 | NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari, |
1180 | NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari); |
1181 | break; |
1182 | case NVPTXISD::LoadV4: |
1183 | Opcode = |
1184 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari, |
1185 | NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, |
1186 | std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt); |
1187 | break; |
1188 | } |
1189 | } |
1190 | if (!Opcode) |
1191 | return false; |
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1197 | } else { |
1198 | if (PointerSize == 64) { |
1199 | switch (N->getOpcode()) { |
1200 | default: |
1201 | return false; |
1202 | case NVPTXISD::LoadV2: |
1203 | Opcode = pickOpcodeForVT( |
1204 | EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64, |
1205 | NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64, |
1206 | NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64, |
1207 | NVPTX::LDV_f64_v2_areg_64); |
1208 | break; |
1209 | case NVPTXISD::LoadV4: |
1210 | Opcode = pickOpcodeForVT( |
1211 | EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64, |
1212 | NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt, |
1213 | NVPTX::LDV_f32_v4_areg_64, std::nullopt); |
1214 | break; |
1215 | } |
1216 | } else { |
1217 | switch (N->getOpcode()) { |
1218 | default: |
1219 | return false; |
1220 | case NVPTXISD::LoadV2: |
1221 | Opcode = |
1222 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg, |
1223 | NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg, |
1224 | NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg, |
1225 | NVPTX::LDV_f64_v2_areg); |
1226 | break; |
1227 | case NVPTXISD::LoadV4: |
1228 | Opcode = |
1229 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg, |
1230 | NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, |
1231 | std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt); |
1232 | break; |
1233 | } |
1234 | } |
1235 | if (!Opcode) |
1236 | return false; |
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Op1, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1241 | } |
1242 | |
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1245 | |
ReplaceNode(N, LD);
1247 | return true; |
1248 | } |
1249 | |
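// Lower ld.global.nc (LDG) and ldu accesses, whether they come from an
// explicit nvvm_ldg/nvvm_ldu intrinsic or from an ordinary load that
// canLowerToLDG() proved safe to treat as an invariant global load.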
1250 | bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { |
1251 | |
SDValue Chain = N->getOperand(0);
1253 | SDValue Op1; |
1254 | MemSDNode *Mem; |
1255 | bool IsLDG = true; |
1256 | |
// If this is an LDG intrinsic, the address is the third operand. If it's an
// LDG/LDU SD node (from custom vector handling), then it's the second operand.
1259 | if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { |
Op1 = N->getOperand(2);
Mem = cast<MemIntrinsicSDNode>(N);
unsigned IID = N->getConstantOperandVal(1);
1263 | switch (IID) { |
1264 | default: |
1265 | return false; |
1266 | case Intrinsic::nvvm_ldg_global_f: |
1267 | case Intrinsic::nvvm_ldg_global_i: |
1268 | case Intrinsic::nvvm_ldg_global_p: |
1269 | IsLDG = true; |
1270 | break; |
1271 | case Intrinsic::nvvm_ldu_global_f: |
1272 | case Intrinsic::nvvm_ldu_global_i: |
1273 | case Intrinsic::nvvm_ldu_global_p: |
1274 | IsLDG = false; |
1275 | break; |
1276 | } |
1277 | } else { |
Op1 = N->getOperand(1);
Mem = cast<MemSDNode>(N);
1280 | } |
1281 | |
1282 | std::optional<unsigned> Opcode; |
1283 | SDLoc DL(N); |
1284 | SDNode *LD; |
1285 | SDValue Base, Offset, Addr; |
EVT OrigType = N->getValueType(0);
1287 | |
1288 | EVT EltVT = Mem->getMemoryVT(); |
1289 | unsigned NumElts = 1; |
1290 | if (EltVT.isVector()) { |
1291 | NumElts = EltVT.getVectorNumElements(); |
1292 | EltVT = EltVT.getVectorElementType(); |
// Vectors of 16-bit types are loaded/stored as multiples of v2x16 elements.
1294 | if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) || |
1295 | (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) || |
1296 | (EltVT == MVT::i16 && OrigType == MVT::v2i16)) { |
assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1298 | EltVT = OrigType; |
1299 | NumElts /= 2; |
1300 | } else if (OrigType == MVT::v4i8) { |
1301 | EltVT = OrigType; |
1302 | NumElts = 1; |
1303 | } |
1304 | } |
1305 | |
1306 | // Build the "promoted" result VTList for the load. If we are really loading |
1307 | // i8s, then the return type will be promoted to i16 since we do not expose |
1308 | // 8-bit registers in NVPTX. |
1309 | EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT; |
1310 | SmallVector<EVT, 5> InstVTs; |
1311 | for (unsigned i = 0; i != NumElts; ++i) { |
InstVTs.push_back(NodeVT);
}
InstVTs.push_back(MVT::Other);
SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1316 | |
if (SelectDirectAddr(Op1, Addr)) {
1318 | switch (N->getOpcode()) { |
1319 | default: |
1320 | return false; |
1321 | case ISD::LOAD: |
1322 | case ISD::INTRINSIC_W_CHAIN: |
1323 | if (IsLDG) |
1324 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1325 | NVPTX::INT_PTX_LDG_GLOBAL_i8avar, |
1326 | NVPTX::INT_PTX_LDG_GLOBAL_i16avar, |
1327 | NVPTX::INT_PTX_LDG_GLOBAL_i32avar, |
1328 | NVPTX::INT_PTX_LDG_GLOBAL_i64avar, |
1329 | NVPTX::INT_PTX_LDG_GLOBAL_f32avar, |
1330 | NVPTX::INT_PTX_LDG_GLOBAL_f64avar); |
1331 | else |
1332 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1333 | NVPTX::INT_PTX_LDU_GLOBAL_i8avar, |
1334 | NVPTX::INT_PTX_LDU_GLOBAL_i16avar, |
1335 | NVPTX::INT_PTX_LDU_GLOBAL_i32avar, |
1336 | NVPTX::INT_PTX_LDU_GLOBAL_i64avar, |
1337 | NVPTX::INT_PTX_LDU_GLOBAL_f32avar, |
1338 | NVPTX::INT_PTX_LDU_GLOBAL_f64avar); |
1339 | break; |
1340 | case NVPTXISD::LoadV2: |
1341 | case NVPTXISD::LDGV2: |
1342 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1343 | NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar, |
1344 | NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar, |
1345 | NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar, |
1346 | NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar, |
1347 | NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar, |
1348 | NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar); |
1349 | break; |
1350 | case NVPTXISD::LDUV2: |
1351 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1352 | NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar, |
1353 | NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar, |
1354 | NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar, |
1355 | NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar, |
1356 | NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar, |
1357 | NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar); |
1358 | break; |
1359 | case NVPTXISD::LoadV4: |
1360 | case NVPTXISD::LDGV4: |
1361 | Opcode = pickOpcodeForVT( |
1362 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar, |
1363 | NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar, |
1364 | NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt, |
1365 | NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt); |
1366 | break; |
1367 | case NVPTXISD::LDUV4: |
1368 | Opcode = pickOpcodeForVT( |
1369 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar, |
1370 | NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar, |
1371 | NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt, |
1372 | NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt); |
1373 | break; |
1374 | } |
1375 | if (!Opcode) |
1376 | return false; |
1377 | SDValue Ops[] = { Addr, Chain }; |
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1381 | if (TM.is64Bit()) { |
1382 | switch (N->getOpcode()) { |
1383 | default: |
1384 | return false; |
1385 | case ISD::LOAD: |
1386 | case ISD::INTRINSIC_W_CHAIN: |
1387 | if (IsLDG) |
1388 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1389 | NVPTX::INT_PTX_LDG_GLOBAL_i8ari64, |
1390 | NVPTX::INT_PTX_LDG_GLOBAL_i16ari64, |
1391 | NVPTX::INT_PTX_LDG_GLOBAL_i32ari64, |
1392 | NVPTX::INT_PTX_LDG_GLOBAL_i64ari64, |
1393 | NVPTX::INT_PTX_LDG_GLOBAL_f32ari64, |
1394 | NVPTX::INT_PTX_LDG_GLOBAL_f64ari64); |
1395 | else |
1396 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1397 | NVPTX::INT_PTX_LDU_GLOBAL_i8ari64, |
1398 | NVPTX::INT_PTX_LDU_GLOBAL_i16ari64, |
1399 | NVPTX::INT_PTX_LDU_GLOBAL_i32ari64, |
1400 | NVPTX::INT_PTX_LDU_GLOBAL_i64ari64, |
1401 | NVPTX::INT_PTX_LDU_GLOBAL_f32ari64, |
1402 | NVPTX::INT_PTX_LDU_GLOBAL_f64ari64); |
1403 | break; |
1404 | case NVPTXISD::LoadV2: |
1405 | case NVPTXISD::LDGV2: |
1406 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1407 | NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64, |
1408 | NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64, |
1409 | NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64, |
1410 | NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64, |
1411 | NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64, |
1412 | NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64); |
1413 | break; |
1414 | case NVPTXISD::LDUV2: |
1415 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1416 | NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64, |
1417 | NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64, |
1418 | NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64, |
1419 | NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64, |
1420 | NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64, |
1421 | NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64); |
1422 | break; |
1423 | case NVPTXISD::LoadV4: |
1424 | case NVPTXISD::LDGV4: |
1425 | Opcode = pickOpcodeForVT( |
1426 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64, |
1427 | NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64, |
1428 | NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt, |
1429 | NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt); |
1430 | break; |
1431 | case NVPTXISD::LDUV4: |
1432 | Opcode = pickOpcodeForVT( |
1433 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64, |
1434 | NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64, |
1435 | NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt, |
1436 | NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt); |
1437 | break; |
1438 | } |
1439 | } else { |
1440 | switch (N->getOpcode()) { |
1441 | default: |
1442 | return false; |
1443 | case ISD::LOAD: |
1444 | case ISD::INTRINSIC_W_CHAIN: |
1445 | if (IsLDG) |
1446 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1447 | NVPTX::INT_PTX_LDG_GLOBAL_i8ari, |
1448 | NVPTX::INT_PTX_LDG_GLOBAL_i16ari, |
1449 | NVPTX::INT_PTX_LDG_GLOBAL_i32ari, |
1450 | NVPTX::INT_PTX_LDG_GLOBAL_i64ari, |
1451 | NVPTX::INT_PTX_LDG_GLOBAL_f32ari, |
1452 | NVPTX::INT_PTX_LDG_GLOBAL_f64ari); |
1453 | else |
1454 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1455 | NVPTX::INT_PTX_LDU_GLOBAL_i8ari, |
1456 | NVPTX::INT_PTX_LDU_GLOBAL_i16ari, |
1457 | NVPTX::INT_PTX_LDU_GLOBAL_i32ari, |
1458 | NVPTX::INT_PTX_LDU_GLOBAL_i64ari, |
1459 | NVPTX::INT_PTX_LDU_GLOBAL_f32ari, |
1460 | NVPTX::INT_PTX_LDU_GLOBAL_f64ari); |
1461 | break; |
1462 | case NVPTXISD::LoadV2: |
1463 | case NVPTXISD::LDGV2: |
1464 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1465 | NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32, |
1466 | NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32, |
1467 | NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32, |
1468 | NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32, |
1469 | NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32, |
1470 | NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32); |
1471 | break; |
1472 | case NVPTXISD::LDUV2: |
1473 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1474 | NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32, |
1475 | NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32, |
1476 | NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32, |
1477 | NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32, |
1478 | NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32, |
1479 | NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32); |
1480 | break; |
1481 | case NVPTXISD::LoadV4: |
1482 | case NVPTXISD::LDGV4: |
1483 | Opcode = pickOpcodeForVT( |
1484 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32, |
1485 | NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32, |
1486 | NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt, |
1487 | NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt); |
1488 | break; |
1489 | case NVPTXISD::LDUV4: |
1490 | Opcode = pickOpcodeForVT( |
1491 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32, |
1492 | NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32, |
1493 | NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt, |
1494 | NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt); |
1495 | break; |
1496 | } |
1497 | } |
1498 | if (!Opcode) |
1499 | return false; |
1500 | SDValue Ops[] = {Base, Offset, Chain}; |
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1502 | } else { |
1503 | if (TM.is64Bit()) { |
1504 | switch (N->getOpcode()) { |
1505 | default: |
1506 | return false; |
1507 | case ISD::LOAD: |
1508 | case ISD::INTRINSIC_W_CHAIN: |
1509 | if (IsLDG) |
1510 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1511 | NVPTX::INT_PTX_LDG_GLOBAL_i8areg64, |
1512 | NVPTX::INT_PTX_LDG_GLOBAL_i16areg64, |
1513 | NVPTX::INT_PTX_LDG_GLOBAL_i32areg64, |
1514 | NVPTX::INT_PTX_LDG_GLOBAL_i64areg64, |
1515 | NVPTX::INT_PTX_LDG_GLOBAL_f32areg64, |
1516 | NVPTX::INT_PTX_LDG_GLOBAL_f64areg64); |
1517 | else |
1518 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1519 | NVPTX::INT_PTX_LDU_GLOBAL_i8areg64, |
1520 | NVPTX::INT_PTX_LDU_GLOBAL_i16areg64, |
1521 | NVPTX::INT_PTX_LDU_GLOBAL_i32areg64, |
1522 | NVPTX::INT_PTX_LDU_GLOBAL_i64areg64, |
1523 | NVPTX::INT_PTX_LDU_GLOBAL_f32areg64, |
1524 | NVPTX::INT_PTX_LDU_GLOBAL_f64areg64); |
1525 | break; |
1526 | case NVPTXISD::LoadV2: |
1527 | case NVPTXISD::LDGV2: |
1528 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1529 | NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64, |
1530 | NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64, |
1531 | NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64, |
1532 | NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64, |
1533 | NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64, |
1534 | NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64); |
1535 | break; |
1536 | case NVPTXISD::LDUV2: |
1537 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1538 | NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64, |
1539 | NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64, |
1540 | NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64, |
1541 | NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64, |
1542 | NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64, |
1543 | NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64); |
1544 | break; |
1545 | case NVPTXISD::LoadV4: |
1546 | case NVPTXISD::LDGV4: |
1547 | Opcode = pickOpcodeForVT( |
1548 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64, |
1549 | NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64, |
1550 | NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt, |
1551 | NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt); |
1552 | break; |
1553 | case NVPTXISD::LDUV4: |
1554 | Opcode = pickOpcodeForVT( |
1555 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64, |
1556 | NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64, |
1557 | NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt, |
1558 | NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt); |
1559 | break; |
1560 | } |
1561 | } else { |
1562 | switch (N->getOpcode()) { |
1563 | default: |
1564 | return false; |
1565 | case ISD::LOAD: |
1566 | case ISD::INTRINSIC_W_CHAIN: |
1567 | if (IsLDG) |
1568 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1569 | NVPTX::INT_PTX_LDG_GLOBAL_i8areg, |
1570 | NVPTX::INT_PTX_LDG_GLOBAL_i16areg, |
1571 | NVPTX::INT_PTX_LDG_GLOBAL_i32areg, |
1572 | NVPTX::INT_PTX_LDG_GLOBAL_i64areg, |
1573 | NVPTX::INT_PTX_LDG_GLOBAL_f32areg, |
1574 | NVPTX::INT_PTX_LDG_GLOBAL_f64areg); |
1575 | else |
1576 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1577 | NVPTX::INT_PTX_LDU_GLOBAL_i8areg, |
1578 | NVPTX::INT_PTX_LDU_GLOBAL_i16areg, |
1579 | NVPTX::INT_PTX_LDU_GLOBAL_i32areg, |
1580 | NVPTX::INT_PTX_LDU_GLOBAL_i64areg, |
1581 | NVPTX::INT_PTX_LDU_GLOBAL_f32areg, |
1582 | NVPTX::INT_PTX_LDU_GLOBAL_f64areg); |
1583 | break; |
1584 | case NVPTXISD::LoadV2: |
1585 | case NVPTXISD::LDGV2: |
1586 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1587 | NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32, |
1588 | NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32, |
1589 | NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32, |
1590 | NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32, |
1591 | NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32, |
1592 | NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32); |
1593 | break; |
1594 | case NVPTXISD::LDUV2: |
1595 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1596 | NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32, |
1597 | NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32, |
1598 | NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32, |
1599 | NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32, |
1600 | NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32, |
1601 | NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32); |
1602 | break; |
1603 | case NVPTXISD::LoadV4: |
1604 | case NVPTXISD::LDGV4: |
1605 | Opcode = pickOpcodeForVT( |
1606 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32, |
1607 | NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32, |
1608 | NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt, |
1609 | NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt); |
1610 | break; |
1611 | case NVPTXISD::LDUV4: |
1612 | Opcode = pickOpcodeForVT( |
1613 | EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32, |
1614 | NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32, |
1615 | NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt, |
1616 | NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt); |
1617 | break; |
1618 | } |
1619 | } |
1620 | if (!Opcode) |
1621 | return false; |
    SDValue Ops[] = {Op1, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1624 | } |
1625 | |
1626 | // For automatic generation of LDG (through SelectLoad[Vector], not the |
1627 | // intrinsics), we may have an extending load like: |
1628 | // |
1629 | // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64 |
1630 | // |
1631 | // In this case, the matching logic above will select a load for the original |
1632 | // memory type (in this case, i8) and our types will not match (the node needs |
1633 | // to return an i32 in this case). Our LDG/LDU nodes do not support the |
1634 | // concept of sign-/zero-extension, so emulate it here by adding an explicit |
1635 | // CVT instruction. Ptxas should clean up any redundancies here. |
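  //
  // As an illustrative sketch (using the i8 zextload above): we select the i8
  // flavor of LDG, then GetConvertOpcode() below picks a widening i8->i32 CVT,
  // and every user of the original i32 result is rewired to the CVT's output.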
1636 | |
  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1638 | |
1639 | if (OrigType != EltVT && |
1640 | (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) { |
1641 | // We have an extending-load. The instruction we selected operates on the |
1642 | // smaller type, but the SDNode we are replacing has the larger type. We |
1643 | // need to emit a CVT to make the types match. |
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1646 | |
1647 | // For each output value, apply the manual sign/zero-extension and make sure |
1648 | // all users of the load go through that CVT. |
1649 | for (unsigned i = 0; i != NumElts; ++i) { |
1650 | SDValue Res(LD, i); |
1651 | SDValue OrigVal(N, i); |
1652 | |
1653 | SDNode *CvtNode = |
1654 | CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res, |
1655 | CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, |
1656 | DL, MVT::i32)); |
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1658 | } |
1659 | } |
1660 | |
  ReplaceNode(N, LD);
1662 | return true; |
1663 | } |
1664 | |
1665 | bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { |
1666 | SDLoc dl(N); |
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
1672 | EVT StoreVT = ST->getMemoryVT(); |
1673 | SDNode *NVPTXST = nullptr; |
1674 | |
1675 | // do not support pre/post inc/dec |
1676 | if (PlainStore && PlainStore->isIndexed()) |
1677 | return false; |
1678 | |
1679 | if (!StoreVT.isSimple()) |
1680 | return false; |
1681 | |
1682 | AtomicOrdering Ordering = ST->getSuccessOrdering(); |
  // In order to lower atomic stores with stronger guarantees, we would need
  // to use store.release or insert fences. However, these features were only
  // added with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
1688 | return false; |
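  // (For example, a release or seq_cst store would presumably need
  // st.release.sys or an explicit fence, both PTX ISA 6.0 / sm_70 features;
  // rather than check for their availability, we conservatively decline to
  // select such stores here.)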
1689 | |
1690 | // Address Space Setting |
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1694 | |
1695 | // Volatile Setting |
1696 | // - .volatile is only available for .global and .shared |
1697 | // - .volatile has the same memory synchronization semantics as .relaxed.sys |
1698 | bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic; |
1699 | if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && |
1700 | CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && |
1701 | CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) |
1702 | isVolatile = false; |
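  // (GENERIC is kept in the list above presumably because a generic pointer
  // may refer to global or shared memory; for all other address spaces the
  // volatile qualifier is simply dropped.)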
1703 | |
1704 | // Vector Setting |
1705 | MVT SimpleVT = StoreVT.getSimpleVT(); |
1706 | unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; |
1707 | |
1708 | // Type Setting: toType + toTypeWidth |
1709 | // - for integer type, always use 'u' |
1710 | // |
1711 | MVT ScalarVT = SimpleVT.getScalarType(); |
1712 | unsigned toTypeWidth = ScalarVT.getSizeInBits(); |
1713 | if (SimpleVT.isVector()) { |
1714 | assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && |
1715 | "Unexpected vector type" ); |
1716 | // v2x16 is stored using st.b32 |
1717 | toTypeWidth = 32; |
1718 | } |
1719 | |
  unsigned int toType = getLdStRegType(ScalarVT);
1721 | |
1722 | // Create the machine instruction DAG |
1723 | SDValue Chain = ST->getChain(); |
1724 | SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); |
1725 | SDValue BasePtr = ST->getBasePtr(); |
1726 | SDValue Addr; |
1727 | SDValue Offset, Base; |
1728 | std::optional<unsigned> Opcode; |
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;
1731 | |
  if (SelectDirectAddr(BasePtr, Addr)) {
1733 | Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, |
1734 | NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, |
1735 | NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); |
1736 | if (!Opcode) |
1737 | return false; |
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
1746 | NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); |
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1750 | Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi, |
1751 | NVPTX::ST_i32_asi, NVPTX::ST_i64_asi, |
1752 | NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); |
1753 | if (!Opcode) |
1754 | return false; |
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
1764 | NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); |
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1768 | if (PointerSize == 64) |
1769 | Opcode = |
1770 | pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64, |
1771 | NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, |
1772 | NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64); |
1773 | else |
1774 | Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari, |
1775 | NVPTX::ST_i32_ari, NVPTX::ST_i64_ari, |
1776 | NVPTX::ST_f32_ari, NVPTX::ST_f64_ari); |
1777 | if (!Opcode) |
1778 | return false; |
1779 | |
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
1789 | NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); |
1790 | } else { |
1791 | if (PointerSize == 64) |
1792 | Opcode = |
1793 | pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64, |
1794 | NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64, |
1795 | NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64); |
1796 | else |
1797 | Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg, |
1798 | NVPTX::ST_i32_areg, NVPTX::ST_i64_areg, |
1799 | NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); |
1800 | if (!Opcode) |
1801 | return false; |
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
1810 | NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); |
1811 | } |
1812 | |
1813 | if (!NVPTXST) |
1814 | return false; |
1815 | |
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
1819 | return true; |
1820 | } |
1821 | |
1822 | bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { |
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
1825 | SDValue Addr, Offset, Base; |
1826 | std::optional<unsigned> Opcode; |
1827 | SDLoc DL(N); |
1828 | SDNode *ST; |
1829 | EVT EltVT = Op1.getValueType(); |
  MemSDNode *MemSD = cast<MemSDNode>(N);
1831 | EVT StoreVT = MemSD->getMemoryVT(); |
1832 | |
1833 | // Address Space Setting |
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
1838 | } |
1839 | unsigned int PointerSize = |
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1841 | |
1842 | // Volatile Setting |
  // - .volatile is only available for .global and .shared
1844 | bool IsVolatile = MemSD->isVolatile(); |
1845 | if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && |
1846 | CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && |
1847 | CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) |
1848 | IsVolatile = false; |
1849 | |
1850 | // Type Setting: toType + toTypeWidth |
1851 | // - for integer type, always use 'u' |
  assert(StoreVT.isSimple() && "Store value is not simple");
1853 | MVT ScalarVT = StoreVT.getSimpleVT().getScalarType(); |
1854 | unsigned ToTypeWidth = ScalarVT.getSizeInBits(); |
  unsigned ToType = getLdStRegType(ScalarVT);
1856 | |
1857 | SmallVector<SDValue, 12> StOps; |
1858 | SDValue N2; |
1859 | unsigned VecType; |
1860 | |
1861 | switch (N->getOpcode()) { |
1862 | case NVPTXISD::StoreV2: |
1863 | VecType = NVPTX::PTXLdStInstCode::V2; |
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
1867 | break; |
1868 | case NVPTXISD::StoreV4: |
1869 | VecType = NVPTX::PTXLdStInstCode::V4; |
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
1875 | break; |
1876 | default: |
1877 | return false; |
1878 | } |
1879 | |
  // v8x16 is a special case. PTX doesn't have an st.v8.x16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // store them with st.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
1885 | EltVT = MVT::i32; |
1886 | ToType = NVPTX::PTXLdStInstCode::Untyped; |
1887 | ToTypeWidth = 32; |
1888 | } |
1889 | |
  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));
1895 | |
  if (SelectDirectAddr(N2, Addr)) {
1897 | switch (N->getOpcode()) { |
1898 | default: |
1899 | return false; |
1900 | case NVPTXISD::StoreV2: |
1901 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1902 | NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar, |
1903 | NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar, |
1904 | NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar); |
1905 | break; |
1906 | case NVPTXISD::StoreV4: |
1907 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1908 | NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar, |
1909 | NVPTX::STV_i32_v4_avar, std::nullopt, |
1910 | NVPTX::STV_f32_v4_avar, std::nullopt); |
1911 | break; |
1912 | } |
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1916 | switch (N->getOpcode()) { |
1917 | default: |
1918 | return false; |
1919 | case NVPTXISD::StoreV2: |
1920 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1921 | NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi, |
1922 | NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi, |
1923 | NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi); |
1924 | break; |
1925 | case NVPTXISD::StoreV4: |
1926 | Opcode = |
1927 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi, |
1928 | NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, |
1929 | std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt); |
1930 | break; |
1931 | } |
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1936 | if (PointerSize == 64) { |
1937 | switch (N->getOpcode()) { |
1938 | default: |
1939 | return false; |
1940 | case NVPTXISD::StoreV2: |
1941 | Opcode = |
1942 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1943 | NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64, |
1944 | NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64, |
1945 | NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64); |
1946 | break; |
1947 | case NVPTXISD::StoreV4: |
1948 | Opcode = pickOpcodeForVT( |
1949 | EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64, |
1950 | NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt, |
1951 | NVPTX::STV_f32_v4_ari_64, std::nullopt); |
1952 | break; |
1953 | } |
1954 | } else { |
1955 | switch (N->getOpcode()) { |
1956 | default: |
1957 | return false; |
1958 | case NVPTXISD::StoreV2: |
1959 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1960 | NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari, |
1961 | NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari, |
1962 | NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari); |
1963 | break; |
1964 | case NVPTXISD::StoreV4: |
1965 | Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, |
1966 | NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari, |
1967 | NVPTX::STV_i32_v4_ari, std::nullopt, |
1968 | NVPTX::STV_f32_v4_ari, std::nullopt); |
1969 | break; |
1970 | } |
1971 | } |
    StOps.push_back(Base);
    StOps.push_back(Offset);
1974 | } else { |
1975 | if (PointerSize == 64) { |
1976 | switch (N->getOpcode()) { |
1977 | default: |
1978 | return false; |
1979 | case NVPTXISD::StoreV2: |
1980 | Opcode = pickOpcodeForVT( |
1981 | EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64, |
1982 | NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64, |
1983 | NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64, |
1984 | NVPTX::STV_f64_v2_areg_64); |
1985 | break; |
1986 | case NVPTXISD::StoreV4: |
1987 | Opcode = pickOpcodeForVT( |
1988 | EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64, |
1989 | NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt, |
1990 | NVPTX::STV_f32_v4_areg_64, std::nullopt); |
1991 | break; |
1992 | } |
1993 | } else { |
1994 | switch (N->getOpcode()) { |
1995 | default: |
1996 | return false; |
1997 | case NVPTXISD::StoreV2: |
1998 | Opcode = |
1999 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg, |
2000 | NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg, |
2001 | NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg, |
2002 | NVPTX::STV_f64_v2_areg); |
2003 | break; |
2004 | case NVPTXISD::StoreV4: |
2005 | Opcode = |
2006 | pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg, |
2007 | NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, |
2008 | std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt); |
2009 | break; |
2010 | } |
2011 | } |
    StOps.push_back(N2);
2013 | } |
2014 | |
2015 | if (!Opcode) |
2016 | return false; |
2017 | |
  StOps.push_back(Chain);
2019 | |
2020 | ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps); |
2021 | |
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
2026 | return true; |
2027 | } |
2028 | |
2029 | bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { |
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);
2035 | |
2036 | unsigned VecSize; |
2037 | switch (Node->getOpcode()) { |
2038 | default: |
2039 | return false; |
2040 | case NVPTXISD::LoadParam: |
2041 | VecSize = 1; |
2042 | break; |
2043 | case NVPTXISD::LoadParamV2: |
2044 | VecSize = 2; |
2045 | break; |
2046 | case NVPTXISD::LoadParamV4: |
2047 | VecSize = 4; |
2048 | break; |
2049 | } |
2050 | |
  EVT EltVT = Node->getValueType(0);
2052 | EVT MemVT = Mem->getMemoryVT(); |
2053 | |
2054 | std::optional<unsigned> Opcode; |
2055 | |
2056 | switch (VecSize) { |
2057 | default: |
2058 | return false; |
2059 | case 1: |
2060 | Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, |
2061 | NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, |
2062 | NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64, |
2063 | NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64); |
2064 | break; |
2065 | case 2: |
2066 | Opcode = |
2067 | pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, |
2068 | NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, |
2069 | NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32, |
2070 | NVPTX::LoadParamMemV2F64); |
2071 | break; |
2072 | case 4: |
2073 | Opcode = |
2074 | pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8, |
2075 | NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, |
2076 | std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt); |
2077 | break; |
2078 | } |
2079 | if (!Opcode) |
2080 | return false; |
2081 | |
2082 | SDVTList VTs; |
2083 | if (VecSize == 1) { |
2084 | VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); |
2085 | } else if (VecSize == 2) { |
2086 | VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); |
2087 | } else { |
2088 | EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; |
2089 | VTs = CurDAG->getVTList(EVTs); |
2090 | } |
2091 | |
2092 | unsigned OffsetVal = Offset->getAsZExtVal(); |
2093 | |
2094 | SmallVector<SDValue, 2> Ops; |
2095 | Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); |
  Ops.push_back(Chain);
  Ops.push_back(Glue);
2098 | |
  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2100 | return true; |
2101 | } |
2102 | |
2103 | bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { |
2104 | SDLoc DL(N); |
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = Offset->getAsZExtVal();
  MemSDNode *Mem = cast<MemSDNode>(N);
2109 | |
2110 | // How many elements do we have? |
2111 | unsigned NumElts = 1; |
2112 | switch (N->getOpcode()) { |
2113 | default: |
2114 | return false; |
2115 | case NVPTXISD::StoreRetval: |
2116 | NumElts = 1; |
2117 | break; |
2118 | case NVPTXISD::StoreRetvalV2: |
2119 | NumElts = 2; |
2120 | break; |
2121 | case NVPTXISD::StoreRetvalV4: |
2122 | NumElts = 4; |
2123 | break; |
2124 | } |
2125 | |
2126 | // Build vector of operands |
2127 | SmallVector<SDValue, 6> Ops; |
2128 | for (unsigned i = 0; i < NumElts; ++i) |
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
2132 | |
2133 | // Determine target opcode |
2134 | // If we have an i1, use an 8-bit store. The lowering code in |
2135 | // NVPTXISelLowering will have already emitted an upcast. |
  std::optional<unsigned> Opcode;
2137 | switch (NumElts) { |
2138 | default: |
2139 | return false; |
2140 | case 1: |
2141 | Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2142 | NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, |
2143 | NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, |
2144 | NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); |
2145 | if (Opcode == NVPTX::StoreRetvalI8) { |
2146 | // Fine tune the opcode depending on the size of the operand. |
2147 | // This helps to avoid creating redundant COPY instructions in |
2148 | // InstrEmitter::AddRegisterOperand(). |
2149 | switch (Ops[0].getSimpleValueType().SimpleTy) { |
2150 | default: |
2151 | break; |
2152 | case MVT::i32: |
2153 | Opcode = NVPTX::StoreRetvalI8TruncI32; |
2154 | break; |
2155 | case MVT::i64: |
2156 | Opcode = NVPTX::StoreRetvalI8TruncI64; |
2157 | break; |
2158 | } |
2159 | } |
2160 | break; |
2161 | case 2: |
2162 | Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2163 | NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16, |
2164 | NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64, |
2165 | NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64); |
2166 | break; |
2167 | case 4: |
2168 | Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2169 | NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16, |
2170 | NVPTX::StoreRetvalV4I32, std::nullopt, |
2171 | NVPTX::StoreRetvalV4F32, std::nullopt); |
2172 | break; |
2173 | } |
2174 | if (!Opcode) |
2175 | return false; |
2176 | |
2177 | SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); |
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
2182 | return true; |
2183 | } |
2184 | |
2185 | bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { |
2186 | SDLoc DL(N); |
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = Param->getAsZExtVal();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = Offset->getAsZExtVal();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2194 | |
2195 | // How many elements do we have? |
2196 | unsigned NumElts = 1; |
2197 | switch (N->getOpcode()) { |
2198 | default: |
2199 | return false; |
2200 | case NVPTXISD::StoreParamU32: |
2201 | case NVPTXISD::StoreParamS32: |
2202 | case NVPTXISD::StoreParam: |
2203 | NumElts = 1; |
2204 | break; |
2205 | case NVPTXISD::StoreParamV2: |
2206 | NumElts = 2; |
2207 | break; |
2208 | case NVPTXISD::StoreParamV4: |
2209 | NumElts = 4; |
2210 | break; |
2211 | } |
2212 | |
2213 | // Build vector of operands |
2214 | SmallVector<SDValue, 8> Ops; |
2215 | for (unsigned i = 0; i < NumElts; ++i) |
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);
2221 | |
2222 | // Determine target opcode |
2223 | // If we have an i1, use an 8-bit store. The lowering code in |
2224 | // NVPTXISelLowering will have already emitted an upcast. |
  std::optional<unsigned> Opcode;
2226 | switch (N->getOpcode()) { |
2227 | default: |
2228 | switch (NumElts) { |
2229 | default: |
2230 | return false; |
2231 | case 1: |
2232 | Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2233 | NVPTX::StoreParamI8, NVPTX::StoreParamI16, |
2234 | NVPTX::StoreParamI32, NVPTX::StoreParamI64, |
2235 | NVPTX::StoreParamF32, NVPTX::StoreParamF64); |
2236 | if (Opcode == NVPTX::StoreParamI8) { |
2237 | // Fine tune the opcode depending on the size of the operand. |
2238 | // This helps to avoid creating redundant COPY instructions in |
2239 | // InstrEmitter::AddRegisterOperand(). |
2240 | switch (Ops[0].getSimpleValueType().SimpleTy) { |
2241 | default: |
2242 | break; |
2243 | case MVT::i32: |
2244 | Opcode = NVPTX::StoreParamI8TruncI32; |
2245 | break; |
2246 | case MVT::i64: |
2247 | Opcode = NVPTX::StoreParamI8TruncI64; |
2248 | break; |
2249 | } |
2250 | } |
2251 | break; |
2252 | case 2: |
2253 | Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2254 | NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16, |
2255 | NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64, |
2256 | NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64); |
2257 | break; |
2258 | case 4: |
2259 | Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2260 | NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16, |
2261 | NVPTX::StoreParamV4I32, std::nullopt, |
2262 | NVPTX::StoreParamV4F32, std::nullopt); |
2263 | break; |
2264 | } |
2265 | if (!Opcode) |
2266 | return false; |
2267 | break; |
2268 | // Special case: if we have a sign-extend/zero-extend node, insert the |
2269 | // conversion instruction first, and use that as the value operand to |
2270 | // the selected StoreParam node. |
2271 | case NVPTXISD::StoreParamU32: { |
2272 | Opcode = NVPTX::StoreParamI32; |
2273 | SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, |
2274 | MVT::i32); |
2275 | SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, |
2276 | MVT::i32, Ops[0], CvtNone); |
2277 | Ops[0] = SDValue(Cvt, 0); |
2278 | break; |
2279 | } |
2280 | case NVPTXISD::StoreParamS32: { |
2281 | Opcode = NVPTX::StoreParamI32; |
2282 | SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, |
2283 | MVT::i32); |
2284 | SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, |
2285 | MVT::i32, Ops[0], CvtNone); |
2286 | Ops[0] = SDValue(Cvt, 0); |
2287 | break; |
2288 | } |
2289 | } |
2290 | |
2291 | SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); |
  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
2297 | return true; |
2298 | } |
2299 | |
2300 | bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { |
2301 | unsigned Opc = 0; |
2302 | |
2303 | switch (N->getOpcode()) { |
2304 | default: return false; |
2305 | case NVPTXISD::Tex1DFloatS32: |
2306 | Opc = NVPTX::TEX_1D_F32_S32_RR; |
2307 | break; |
2308 | case NVPTXISD::Tex1DFloatFloat: |
2309 | Opc = NVPTX::TEX_1D_F32_F32_RR; |
2310 | break; |
2311 | case NVPTXISD::Tex1DFloatFloatLevel: |
2312 | Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR; |
2313 | break; |
2314 | case NVPTXISD::Tex1DFloatFloatGrad: |
2315 | Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR; |
2316 | break; |
2317 | case NVPTXISD::Tex1DS32S32: |
2318 | Opc = NVPTX::TEX_1D_S32_S32_RR; |
2319 | break; |
2320 | case NVPTXISD::Tex1DS32Float: |
2321 | Opc = NVPTX::TEX_1D_S32_F32_RR; |
2322 | break; |
2323 | case NVPTXISD::Tex1DS32FloatLevel: |
2324 | Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR; |
2325 | break; |
2326 | case NVPTXISD::Tex1DS32FloatGrad: |
2327 | Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR; |
2328 | break; |
2329 | case NVPTXISD::Tex1DU32S32: |
2330 | Opc = NVPTX::TEX_1D_U32_S32_RR; |
2331 | break; |
2332 | case NVPTXISD::Tex1DU32Float: |
2333 | Opc = NVPTX::TEX_1D_U32_F32_RR; |
2334 | break; |
2335 | case NVPTXISD::Tex1DU32FloatLevel: |
2336 | Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR; |
2337 | break; |
2338 | case NVPTXISD::Tex1DU32FloatGrad: |
2339 | Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR; |
2340 | break; |
2341 | case NVPTXISD::Tex1DArrayFloatS32: |
2342 | Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR; |
2343 | break; |
2344 | case NVPTXISD::Tex1DArrayFloatFloat: |
2345 | Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR; |
2346 | break; |
2347 | case NVPTXISD::Tex1DArrayFloatFloatLevel: |
2348 | Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR; |
2349 | break; |
2350 | case NVPTXISD::Tex1DArrayFloatFloatGrad: |
2351 | Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR; |
2352 | break; |
2353 | case NVPTXISD::Tex1DArrayS32S32: |
2354 | Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR; |
2355 | break; |
2356 | case NVPTXISD::Tex1DArrayS32Float: |
2357 | Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR; |
2358 | break; |
2359 | case NVPTXISD::Tex1DArrayS32FloatLevel: |
2360 | Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR; |
2361 | break; |
2362 | case NVPTXISD::Tex1DArrayS32FloatGrad: |
2363 | Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR; |
2364 | break; |
2365 | case NVPTXISD::Tex1DArrayU32S32: |
2366 | Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR; |
2367 | break; |
2368 | case NVPTXISD::Tex1DArrayU32Float: |
2369 | Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR; |
2370 | break; |
2371 | case NVPTXISD::Tex1DArrayU32FloatLevel: |
2372 | Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR; |
2373 | break; |
2374 | case NVPTXISD::Tex1DArrayU32FloatGrad: |
2375 | Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR; |
2376 | break; |
2377 | case NVPTXISD::Tex2DFloatS32: |
2378 | Opc = NVPTX::TEX_2D_F32_S32_RR; |
2379 | break; |
2380 | case NVPTXISD::Tex2DFloatFloat: |
2381 | Opc = NVPTX::TEX_2D_F32_F32_RR; |
2382 | break; |
2383 | case NVPTXISD::Tex2DFloatFloatLevel: |
2384 | Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR; |
2385 | break; |
2386 | case NVPTXISD::Tex2DFloatFloatGrad: |
2387 | Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR; |
2388 | break; |
2389 | case NVPTXISD::Tex2DS32S32: |
2390 | Opc = NVPTX::TEX_2D_S32_S32_RR; |
2391 | break; |
2392 | case NVPTXISD::Tex2DS32Float: |
2393 | Opc = NVPTX::TEX_2D_S32_F32_RR; |
2394 | break; |
2395 | case NVPTXISD::Tex2DS32FloatLevel: |
2396 | Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR; |
2397 | break; |
2398 | case NVPTXISD::Tex2DS32FloatGrad: |
2399 | Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR; |
2400 | break; |
2401 | case NVPTXISD::Tex2DU32S32: |
2402 | Opc = NVPTX::TEX_2D_U32_S32_RR; |
2403 | break; |
2404 | case NVPTXISD::Tex2DU32Float: |
2405 | Opc = NVPTX::TEX_2D_U32_F32_RR; |
2406 | break; |
2407 | case NVPTXISD::Tex2DU32FloatLevel: |
2408 | Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR; |
2409 | break; |
2410 | case NVPTXISD::Tex2DU32FloatGrad: |
2411 | Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR; |
2412 | break; |
2413 | case NVPTXISD::Tex2DArrayFloatS32: |
2414 | Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR; |
2415 | break; |
2416 | case NVPTXISD::Tex2DArrayFloatFloat: |
2417 | Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR; |
2418 | break; |
2419 | case NVPTXISD::Tex2DArrayFloatFloatLevel: |
2420 | Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR; |
2421 | break; |
2422 | case NVPTXISD::Tex2DArrayFloatFloatGrad: |
2423 | Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR; |
2424 | break; |
2425 | case NVPTXISD::Tex2DArrayS32S32: |
2426 | Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR; |
2427 | break; |
2428 | case NVPTXISD::Tex2DArrayS32Float: |
2429 | Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR; |
2430 | break; |
2431 | case NVPTXISD::Tex2DArrayS32FloatLevel: |
2432 | Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR; |
2433 | break; |
2434 | case NVPTXISD::Tex2DArrayS32FloatGrad: |
2435 | Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR; |
2436 | break; |
2437 | case NVPTXISD::Tex2DArrayU32S32: |
2438 | Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR; |
2439 | break; |
2440 | case NVPTXISD::Tex2DArrayU32Float: |
2441 | Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR; |
2442 | break; |
2443 | case NVPTXISD::Tex2DArrayU32FloatLevel: |
2444 | Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR; |
2445 | break; |
2446 | case NVPTXISD::Tex2DArrayU32FloatGrad: |
2447 | Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR; |
2448 | break; |
2449 | case NVPTXISD::Tex3DFloatS32: |
2450 | Opc = NVPTX::TEX_3D_F32_S32_RR; |
2451 | break; |
2452 | case NVPTXISD::Tex3DFloatFloat: |
2453 | Opc = NVPTX::TEX_3D_F32_F32_RR; |
2454 | break; |
2455 | case NVPTXISD::Tex3DFloatFloatLevel: |
2456 | Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR; |
2457 | break; |
2458 | case NVPTXISD::Tex3DFloatFloatGrad: |
2459 | Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR; |
2460 | break; |
2461 | case NVPTXISD::Tex3DS32S32: |
2462 | Opc = NVPTX::TEX_3D_S32_S32_RR; |
2463 | break; |
2464 | case NVPTXISD::Tex3DS32Float: |
2465 | Opc = NVPTX::TEX_3D_S32_F32_RR; |
2466 | break; |
2467 | case NVPTXISD::Tex3DS32FloatLevel: |
2468 | Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR; |
2469 | break; |
2470 | case NVPTXISD::Tex3DS32FloatGrad: |
2471 | Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR; |
2472 | break; |
2473 | case NVPTXISD::Tex3DU32S32: |
2474 | Opc = NVPTX::TEX_3D_U32_S32_RR; |
2475 | break; |
2476 | case NVPTXISD::Tex3DU32Float: |
2477 | Opc = NVPTX::TEX_3D_U32_F32_RR; |
2478 | break; |
2479 | case NVPTXISD::Tex3DU32FloatLevel: |
2480 | Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR; |
2481 | break; |
2482 | case NVPTXISD::Tex3DU32FloatGrad: |
2483 | Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR; |
2484 | break; |
2485 | case NVPTXISD::TexCubeFloatFloat: |
2486 | Opc = NVPTX::TEX_CUBE_F32_F32_RR; |
2487 | break; |
2488 | case NVPTXISD::TexCubeFloatFloatLevel: |
2489 | Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR; |
2490 | break; |
2491 | case NVPTXISD::TexCubeS32Float: |
2492 | Opc = NVPTX::TEX_CUBE_S32_F32_RR; |
2493 | break; |
2494 | case NVPTXISD::TexCubeS32FloatLevel: |
2495 | Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR; |
2496 | break; |
2497 | case NVPTXISD::TexCubeU32Float: |
2498 | Opc = NVPTX::TEX_CUBE_U32_F32_RR; |
2499 | break; |
2500 | case NVPTXISD::TexCubeU32FloatLevel: |
2501 | Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR; |
2502 | break; |
2503 | case NVPTXISD::TexCubeArrayFloatFloat: |
2504 | Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR; |
2505 | break; |
2506 | case NVPTXISD::TexCubeArrayFloatFloatLevel: |
2507 | Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR; |
2508 | break; |
2509 | case NVPTXISD::TexCubeArrayS32Float: |
2510 | Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR; |
2511 | break; |
2512 | case NVPTXISD::TexCubeArrayS32FloatLevel: |
2513 | Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR; |
2514 | break; |
2515 | case NVPTXISD::TexCubeArrayU32Float: |
2516 | Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR; |
2517 | break; |
2518 | case NVPTXISD::TexCubeArrayU32FloatLevel: |
2519 | Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR; |
2520 | break; |
2521 | case NVPTXISD::Tld4R2DFloatFloat: |
2522 | Opc = NVPTX::TLD4_R_2D_F32_F32_RR; |
2523 | break; |
2524 | case NVPTXISD::Tld4G2DFloatFloat: |
2525 | Opc = NVPTX::TLD4_G_2D_F32_F32_RR; |
2526 | break; |
2527 | case NVPTXISD::Tld4B2DFloatFloat: |
2528 | Opc = NVPTX::TLD4_B_2D_F32_F32_RR; |
2529 | break; |
2530 | case NVPTXISD::Tld4A2DFloatFloat: |
2531 | Opc = NVPTX::TLD4_A_2D_F32_F32_RR; |
2532 | break; |
2533 | case NVPTXISD::Tld4R2DS64Float: |
2534 | Opc = NVPTX::TLD4_R_2D_S32_F32_RR; |
2535 | break; |
2536 | case NVPTXISD::Tld4G2DS64Float: |
2537 | Opc = NVPTX::TLD4_G_2D_S32_F32_RR; |
2538 | break; |
2539 | case NVPTXISD::Tld4B2DS64Float: |
2540 | Opc = NVPTX::TLD4_B_2D_S32_F32_RR; |
2541 | break; |
2542 | case NVPTXISD::Tld4A2DS64Float: |
2543 | Opc = NVPTX::TLD4_A_2D_S32_F32_RR; |
2544 | break; |
2545 | case NVPTXISD::Tld4R2DU64Float: |
2546 | Opc = NVPTX::TLD4_R_2D_U32_F32_RR; |
2547 | break; |
2548 | case NVPTXISD::Tld4G2DU64Float: |
2549 | Opc = NVPTX::TLD4_G_2D_U32_F32_RR; |
2550 | break; |
2551 | case NVPTXISD::Tld4B2DU64Float: |
2552 | Opc = NVPTX::TLD4_B_2D_U32_F32_RR; |
2553 | break; |
2554 | case NVPTXISD::Tld4A2DU64Float: |
2555 | Opc = NVPTX::TLD4_A_2D_U32_F32_RR; |
2556 | break; |
2557 | case NVPTXISD::TexUnified1DFloatS32: |
2558 | Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R; |
2559 | break; |
2560 | case NVPTXISD::TexUnified1DFloatFloat: |
2561 | Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R; |
2562 | break; |
2563 | case NVPTXISD::TexUnified1DFloatFloatLevel: |
2564 | Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R; |
2565 | break; |
2566 | case NVPTXISD::TexUnified1DFloatFloatGrad: |
2567 | Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R; |
2568 | break; |
2569 | case NVPTXISD::TexUnified1DS32S32: |
2570 | Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R; |
2571 | break; |
2572 | case NVPTXISD::TexUnified1DS32Float: |
2573 | Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R; |
2574 | break; |
2575 | case NVPTXISD::TexUnified1DS32FloatLevel: |
2576 | Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R; |
2577 | break; |
2578 | case NVPTXISD::TexUnified1DS32FloatGrad: |
2579 | Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R; |
2580 | break; |
2581 | case NVPTXISD::TexUnified1DU32S32: |
2582 | Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R; |
2583 | break; |
2584 | case NVPTXISD::TexUnified1DU32Float: |
2585 | Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R; |
2586 | break; |
2587 | case NVPTXISD::TexUnified1DU32FloatLevel: |
2588 | Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R; |
2589 | break; |
2590 | case NVPTXISD::TexUnified1DU32FloatGrad: |
2591 | Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R; |
2592 | break; |
2593 | case NVPTXISD::TexUnified1DArrayFloatS32: |
2594 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R; |
2595 | break; |
2596 | case NVPTXISD::TexUnified1DArrayFloatFloat: |
2597 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R; |
2598 | break; |
2599 | case NVPTXISD::TexUnified1DArrayFloatFloatLevel: |
2600 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R; |
2601 | break; |
2602 | case NVPTXISD::TexUnified1DArrayFloatFloatGrad: |
2603 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R; |
2604 | break; |
2605 | case NVPTXISD::TexUnified1DArrayS32S32: |
2606 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R; |
2607 | break; |
2608 | case NVPTXISD::TexUnified1DArrayS32Float: |
2609 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R; |
2610 | break; |
2611 | case NVPTXISD::TexUnified1DArrayS32FloatLevel: |
2612 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R; |
2613 | break; |
2614 | case NVPTXISD::TexUnified1DArrayS32FloatGrad: |
2615 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R; |
2616 | break; |
2617 | case NVPTXISD::TexUnified1DArrayU32S32: |
2618 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R; |
2619 | break; |
2620 | case NVPTXISD::TexUnified1DArrayU32Float: |
2621 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R; |
2622 | break; |
2623 | case NVPTXISD::TexUnified1DArrayU32FloatLevel: |
2624 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R; |
2625 | break; |
2626 | case NVPTXISD::TexUnified1DArrayU32FloatGrad: |
2627 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R; |
2628 | break; |
2629 | case NVPTXISD::TexUnified2DFloatS32: |
2630 | Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R; |
2631 | break; |
2632 | case NVPTXISD::TexUnified2DFloatFloat: |
2633 | Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R; |
2634 | break; |
2635 | case NVPTXISD::TexUnified2DFloatFloatLevel: |
2636 | Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R; |
2637 | break; |
2638 | case NVPTXISD::TexUnified2DFloatFloatGrad: |
2639 | Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R; |
2640 | break; |
2641 | case NVPTXISD::TexUnified2DS32S32: |
2642 | Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R; |
2643 | break; |
2644 | case NVPTXISD::TexUnified2DS32Float: |
2645 | Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R; |
2646 | break; |
2647 | case NVPTXISD::TexUnified2DS32FloatLevel: |
2648 | Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R; |
2649 | break; |
2650 | case NVPTXISD::TexUnified2DS32FloatGrad: |
2651 | Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R; |
2652 | break; |
2653 | case NVPTXISD::TexUnified2DU32S32: |
2654 | Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R; |
2655 | break; |
2656 | case NVPTXISD::TexUnified2DU32Float: |
2657 | Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R; |
2658 | break; |
2659 | case NVPTXISD::TexUnified2DU32FloatLevel: |
2660 | Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R; |
2661 | break; |
2662 | case NVPTXISD::TexUnified2DU32FloatGrad: |
2663 | Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R; |
2664 | break; |
2665 | case NVPTXISD::TexUnified2DArrayFloatS32: |
2666 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R; |
2667 | break; |
2668 | case NVPTXISD::TexUnified2DArrayFloatFloat: |
2669 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R; |
2670 | break; |
2671 | case NVPTXISD::TexUnified2DArrayFloatFloatLevel: |
2672 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R; |
2673 | break; |
2674 | case NVPTXISD::TexUnified2DArrayFloatFloatGrad: |
2675 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R; |
2676 | break; |
2677 | case NVPTXISD::TexUnified2DArrayS32S32: |
2678 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R; |
2679 | break; |
2680 | case NVPTXISD::TexUnified2DArrayS32Float: |
2681 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R; |
2682 | break; |
2683 | case NVPTXISD::TexUnified2DArrayS32FloatLevel: |
2684 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R; |
2685 | break; |
2686 | case NVPTXISD::TexUnified2DArrayS32FloatGrad: |
2687 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R; |
2688 | break; |
2689 | case NVPTXISD::TexUnified2DArrayU32S32: |
2690 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R; |
2691 | break; |
2692 | case NVPTXISD::TexUnified2DArrayU32Float: |
2693 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R; |
2694 | break; |
2695 | case NVPTXISD::TexUnified2DArrayU32FloatLevel: |
2696 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R; |
2697 | break; |
2698 | case NVPTXISD::TexUnified2DArrayU32FloatGrad: |
2699 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R; |
2700 | break; |
2701 | case NVPTXISD::TexUnified3DFloatS32: |
2702 | Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R; |
2703 | break; |
2704 | case NVPTXISD::TexUnified3DFloatFloat: |
2705 | Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R; |
2706 | break; |
2707 | case NVPTXISD::TexUnified3DFloatFloatLevel: |
2708 | Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R; |
2709 | break; |
2710 | case NVPTXISD::TexUnified3DFloatFloatGrad: |
2711 | Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R; |
2712 | break; |
2713 | case NVPTXISD::TexUnified3DS32S32: |
2714 | Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R; |
2715 | break; |
2716 | case NVPTXISD::TexUnified3DS32Float: |
2717 | Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R; |
2718 | break; |
2719 | case NVPTXISD::TexUnified3DS32FloatLevel: |
2720 | Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R; |
2721 | break; |
2722 | case NVPTXISD::TexUnified3DS32FloatGrad: |
2723 | Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R; |
2724 | break; |
2725 | case NVPTXISD::TexUnified3DU32S32: |
2726 | Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R; |
2727 | break; |
2728 | case NVPTXISD::TexUnified3DU32Float: |
2729 | Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R; |
2730 | break; |
2731 | case NVPTXISD::TexUnified3DU32FloatLevel: |
2732 | Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R; |
2733 | break; |
2734 | case NVPTXISD::TexUnified3DU32FloatGrad: |
2735 | Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R; |
2736 | break; |
2737 | case NVPTXISD::TexUnifiedCubeFloatFloat: |
2738 | Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R; |
2739 | break; |
2740 | case NVPTXISD::TexUnifiedCubeFloatFloatLevel: |
2741 | Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R; |
2742 | break; |
2743 | case NVPTXISD::TexUnifiedCubeS32Float: |
2744 | Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R; |
2745 | break; |
2746 | case NVPTXISD::TexUnifiedCubeS32FloatLevel: |
2747 | Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R; |
2748 | break; |
2749 | case NVPTXISD::TexUnifiedCubeU32Float: |
2750 | Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R; |
2751 | break; |
2752 | case NVPTXISD::TexUnifiedCubeU32FloatLevel: |
2753 | Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R; |
2754 | break; |
2755 | case NVPTXISD::TexUnifiedCubeArrayFloatFloat: |
2756 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R; |
2757 | break; |
2758 | case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: |
2759 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R; |
2760 | break; |
2761 | case NVPTXISD::TexUnifiedCubeArrayS32Float: |
2762 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R; |
2763 | break; |
2764 | case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: |
2765 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R; |
2766 | break; |
2767 | case NVPTXISD::TexUnifiedCubeArrayU32Float: |
2768 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R; |
2769 | break; |
2770 | case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: |
2771 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R; |
2772 | break; |
2773 | case NVPTXISD::Tld4UnifiedR2DFloatFloat: |
2774 | Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R; |
2775 | break; |
2776 | case NVPTXISD::Tld4UnifiedG2DFloatFloat: |
2777 | Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R; |
2778 | break; |
2779 | case NVPTXISD::Tld4UnifiedB2DFloatFloat: |
2780 | Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R; |
2781 | break; |
2782 | case NVPTXISD::Tld4UnifiedA2DFloatFloat: |
2783 | Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R; |
2784 | break; |
2785 | case NVPTXISD::Tld4UnifiedR2DS64Float: |
2786 | Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R; |
2787 | break; |
2788 | case NVPTXISD::Tld4UnifiedG2DS64Float: |
2789 | Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R; |
2790 | break; |
2791 | case NVPTXISD::Tld4UnifiedB2DS64Float: |
2792 | Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R; |
2793 | break; |
2794 | case NVPTXISD::Tld4UnifiedA2DS64Float: |
2795 | Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R; |
2796 | break; |
2797 | case NVPTXISD::Tld4UnifiedR2DU64Float: |
2798 | Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R; |
2799 | break; |
2800 | case NVPTXISD::Tld4UnifiedG2DU64Float: |
2801 | Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R; |
2802 | break; |
2803 | case NVPTXISD::Tld4UnifiedB2DU64Float: |
2804 | Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R; |
2805 | break; |
2806 | case NVPTXISD::Tld4UnifiedA2DU64Float: |
2807 | Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; |
2808 | break; |
2809 | case NVPTXISD::TexUnifiedCubeFloatFloatGrad: |
2810 | Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R; |
2811 | break; |
2812 | case NVPTXISD::TexUnifiedCubeS32FloatGrad: |
2813 | Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R; |
2814 | break; |
2815 | case NVPTXISD::TexUnifiedCubeU32FloatGrad: |
2816 | Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R; |
2817 | break; |
2818 | case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: |
2819 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R; |
2820 | break; |
2821 | case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: |
2822 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R; |
2823 | break; |
2824 | case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: |
2825 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R; |
2826 | break; |
2827 | } |
2828 | |
2829 | // Copy over operands |
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2834 | return true; |
2835 | } |
2836 | |
bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;
  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Suld1DI8Clamp:
    Opc = NVPTX::SULD_1D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI16Clamp:
    Opc = NVPTX::SULD_1D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI32Clamp:
    Opc = NVPTX::SULD_1D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI64Clamp:
    Opc = NVPTX::SULD_1D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I8Clamp:
    Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I16Clamp:
    Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I32Clamp:
    Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I64Clamp:
    Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I8Clamp:
    Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I16Clamp:
    Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I32Clamp:
    Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI8Clamp:
    Opc = NVPTX::SULD_2D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI16Clamp:
    Opc = NVPTX::SULD_2D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI32Clamp:
    Opc = NVPTX::SULD_2D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI64Clamp:
    Opc = NVPTX::SULD_2D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I8Clamp:
    Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I16Clamp:
    Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I32Clamp:
    Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I64Clamp:
    Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I8Clamp:
    Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I16Clamp:
    Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I32Clamp:
    Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI8Clamp:
    Opc = NVPTX::SULD_3D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI16Clamp:
    Opc = NVPTX::SULD_3D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI32Clamp:
    Opc = NVPTX::SULD_3D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI64Clamp:
    Opc = NVPTX::SULD_3D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I8Clamp:
    Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I16Clamp:
    Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I32Clamp:
    Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I64Clamp:
    Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I8Clamp:
    Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I16Clamp:
    Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I32Clamp:
    Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI8Trap:
    Opc = NVPTX::SULD_1D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DI16Trap:
    Opc = NVPTX::SULD_1D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DI32Trap:
    Opc = NVPTX::SULD_1D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI64Trap:
    Opc = NVPTX::SULD_1D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I8Trap:
    Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I16Trap:
    Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I32Trap:
    Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I64Trap:
    Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I8Trap:
    Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I16Trap:
    Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I32Trap:
    Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI8Trap:
    Opc = NVPTX::SULD_2D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DI16Trap:
    Opc = NVPTX::SULD_2D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DI32Trap:
    Opc = NVPTX::SULD_2D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI64Trap:
    Opc = NVPTX::SULD_2D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I8Trap:
    Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I16Trap:
    Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I32Trap:
    Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I64Trap:
    Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I8Trap:
    Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I16Trap:
    Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I32Trap:
    Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI8Trap:
    Opc = NVPTX::SULD_3D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DI16Trap:
    Opc = NVPTX::SULD_3D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DI32Trap:
    Opc = NVPTX::SULD_3D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI64Trap:
    Opc = NVPTX::SULD_3D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I8Trap:
    Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I16Trap:
    Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I32Trap:
    Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I64Trap:
    Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I8Trap:
    Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I16Trap:
    Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I32Trap:
    Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI8Zero:
    Opc = NVPTX::SULD_1D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DI16Zero:
    Opc = NVPTX::SULD_1D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DI32Zero:
    Opc = NVPTX::SULD_1D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DI64Zero:
    Opc = NVPTX::SULD_1D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I8Zero:
    Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I16Zero:
    Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I32Zero:
    Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I64Zero:
    Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I8Zero:
    Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I16Zero:
    Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I32Zero:
    Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI8Zero:
    Opc = NVPTX::SULD_2D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DI16Zero:
    Opc = NVPTX::SULD_2D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DI32Zero:
    Opc = NVPTX::SULD_2D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI64Zero:
    Opc = NVPTX::SULD_2D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I8Zero:
    Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I16Zero:
    Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I32Zero:
    Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I64Zero:
    Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I8Zero:
    Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I16Zero:
    Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I32Zero:
    Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI8Zero:
    Opc = NVPTX::SULD_3D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DI16Zero:
    Opc = NVPTX::SULD_3D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DI32Zero:
    Opc = NVPTX::SULD_3D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI64Zero:
    Opc = NVPTX::SULD_3D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I8Zero:
    Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I16Zero:
    Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I32Zero:
    Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I64Zero:
    Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I8Zero:
    Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I16Zero:
    Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I32Zero:
    Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

/// tryBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
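/// For illustration (PTX spelled informally), on an i32 value:
///   (and (srl x, 4), 0xff)  ->  bfe.u32 r, x, 4, 8
///   (srl (shl x, 8), 24)    ->  bfe.u32 r, x, 16, 8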
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits, so we would trade an
      // shr+and pair for a bfe+and pair, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    int64_t NumBits = countr_one(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have an 'srl/and' pair; extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
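        // e.g. for an i32 value with StartVal == 24 there are only 8 good
        // bits left, so a mask requesting more than 8 bits is rejected below.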
        int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (which can be zero if
        // no srl was found) is not constant. We could handle this case, but it
        // would require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = llvm::countr_zero(MaskVal);
        unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }
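      // Worked example (illustrative): MaskVal == 0x00ff0000 with
      // ShiftAmt == 16 gives NumZeros == 16 and NumBits == 16 + 8 - 16 == 8,
      // i.e. (x & 0x00ff0000) >> 16 becomes bfe.u32 r, x, 16, 8.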

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
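      // e.g. (srl (shl x, 8), 24) on an i32 value yields start 16, len 8,
      // i.e. bfe.u32 r, x, 16, 8 (illustrative).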
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer
      // shift, this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }

  unsigned Opc;
  // For the BFE operations formed here from 'and' and 'srl', always use the
  // unsigned variants; the signed variant is only needed for the sra/shl
  // pattern handled above.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {Val, Start, Len};

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}

// SelectDirectAddr - Match a direct address for the DAG.
// A direct address could be a global address or an external symbol.
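// e.g. a bare TargetGlobalAddress @g and @g wrapped in NVPTXISD::Wrapper
// both select to the symbol @g itself.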
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}

// symbol+offset
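// e.g. (add (NVPTXISD::Wrapper (TargetGlobalAddress @g)), 16) selects as
// Base = @g, Offset = 16.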
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr,
                                         SDValue &Base, SDValue &Offset,
                                         MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                           mvt);
        return true;
      }
    }
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

// register+offset
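// e.g. a frame index selects as Base = <fi>, Offset = 0, while
// (add %r, 4) selects as Base = %r, Offset = 4.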
bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr,
                                         SDValue &Base, SDValue &Offset,
                                         MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

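/// ChkMemSDNodeAddressSpace - Return true if the memory operand of \p N is
/// known to be in address space \p spN; a pseudo source value is treated as
/// matching the generic address space (0).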
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}

/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
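/// Follows the SelectionDAG convention of returning false on success and
/// true if the operand could not be selected.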
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::ConstraintCode::m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}

/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
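/// For example, a sign-extending i8 load widened to i32 selects
/// NVPTX::CVT_s32_s8, while a zero- or any-extending one selects
/// NVPTX::CVT_u32_u8.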
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             LoadSDNode *LdNode) {
  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  case MVT::f16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::f32:
      return NVPTX::CVT_f32_f16;
    case MVT::f64:
      return NVPTX::CVT_f64_f16;
    }
  }
}