| 1 | //===- ArmSMEToLLVM.cpp - Convert ArmSME to LLVM dialect ------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements lowering of ArmSME operations to LLVM intrinsics. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "mlir/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.h" |
| 14 | |
| 15 | #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" |
| 16 | #include "mlir/Conversion/LLVMCommon/Pattern.h" |
| 17 | #include "mlir/Dialect/Arith/IR/Arith.h" |
| 18 | #include "mlir/Dialect/ArmSME/IR/ArmSME.h" |
| 19 | #include "mlir/Dialect/ArmSME/Transforms/Transforms.h" |
| 20 | #include "mlir/Dialect/ArmSME/Utils/Utils.h" |
| 21 | #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" |
| 22 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| 23 | #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| 24 | #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| 25 | #include "mlir/Pass/Pass.h" |
| 26 | #include "mlir/Transforms/DialectConversion.h" |
| 27 | #include "llvm/ADT/ScopeExit.h" |
| 28 | |
| 29 | namespace mlir { |
| 30 | #define GEN_PASS_DEF_CONVERTARMSMETOLLVM |
| 31 | #include "mlir/Conversion/Passes.h.inc" |
| 32 | } // namespace mlir |
| 33 | |
| 34 | using namespace mlir; |
| 35 | |
| 36 | namespace { |
| 37 | |
| 38 | static constexpr StringLiteral kInMemoryTileIdAttr("arm_sme.in_memory_tile_id" ); |
| 39 | |
| 40 | /// Helper to create an arm_sme.intr.ld1*.(horiz|vert)' intrinsic. |
| 41 | static Operation *createLoadTileSliceIntrinsic( |
| 42 | RewriterBase &rewriter, Location loc, arm_sme::ArmSMETileType type, |
| 43 | arm_sme::TileSliceLayout layout, Value maskOp, Value ptr, |
| 44 | IntegerAttr tileId, Value tileSliceI32) { |
| 45 | if (layout == arm_sme::TileSliceLayout::Horizontal) { |
| 46 | switch (type) { |
| 47 | case arm_sme::ArmSMETileType::ZAB: |
| 48 | return rewriter.create<arm_sme::aarch64_sme_ld1b_horiz>( |
| 49 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 50 | case arm_sme::ArmSMETileType::ZAH: |
| 51 | return rewriter.create<arm_sme::aarch64_sme_ld1h_horiz>( |
| 52 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 53 | case arm_sme::ArmSMETileType::ZAS: |
| 54 | return rewriter.create<arm_sme::aarch64_sme_ld1w_horiz>( |
| 55 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 56 | case arm_sme::ArmSMETileType::ZAD: |
| 57 | return rewriter.create<arm_sme::aarch64_sme_ld1d_horiz>( |
| 58 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 59 | case arm_sme::ArmSMETileType::ZAQ: |
| 60 | return rewriter.create<arm_sme::aarch64_sme_ld1q_horiz>( |
| 61 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 62 | } |
| 63 | } else { |
| 64 | switch (type) { |
| 65 | case arm_sme::ArmSMETileType::ZAB: |
| 66 | return rewriter.create<arm_sme::aarch64_sme_ld1b_vert>( |
| 67 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 68 | case arm_sme::ArmSMETileType::ZAH: |
| 69 | return rewriter.create<arm_sme::aarch64_sme_ld1h_vert>( |
| 70 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 71 | case arm_sme::ArmSMETileType::ZAS: |
| 72 | return rewriter.create<arm_sme::aarch64_sme_ld1w_vert>( |
| 73 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 74 | case arm_sme::ArmSMETileType::ZAD: |
| 75 | return rewriter.create<arm_sme::aarch64_sme_ld1d_vert>( |
| 76 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 77 | case arm_sme::ArmSMETileType::ZAQ: |
| 78 | return rewriter.create<arm_sme::aarch64_sme_ld1q_vert>( |
| 79 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 80 | break; |
| 81 | } |
| 82 | } |
| 83 | llvm_unreachable("unknown type in createLoadTileSliceIntrinsic" ); |
| 84 | } |
| 85 | |
| 86 | /// Helper to create an arm_sme.intr.st1*.(horiz|vert)' intrinsic. |
| 87 | static Operation *createStoreTileSliceIntrinsic( |
| 88 | RewriterBase &rewriter, Location loc, arm_sme::ArmSMETileType type, |
| 89 | arm_sme::TileSliceLayout layout, Value maskOp, Value ptr, |
| 90 | IntegerAttr tileId, Value tileSliceI32) { |
| 91 | if (layout == arm_sme::TileSliceLayout::Horizontal) { |
| 92 | switch (type) { |
| 93 | case arm_sme::ArmSMETileType::ZAB: |
| 94 | return rewriter.create<arm_sme::aarch64_sme_st1b_horiz>( |
| 95 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 96 | case arm_sme::ArmSMETileType::ZAH: |
| 97 | return rewriter.create<arm_sme::aarch64_sme_st1h_horiz>( |
| 98 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 99 | case arm_sme::ArmSMETileType::ZAS: |
| 100 | return rewriter.create<arm_sme::aarch64_sme_st1w_horiz>( |
| 101 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 102 | case arm_sme::ArmSMETileType::ZAD: |
| 103 | return rewriter.create<arm_sme::aarch64_sme_st1d_horiz>( |
| 104 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 105 | case arm_sme::ArmSMETileType::ZAQ: |
| 106 | return rewriter.create<arm_sme::aarch64_sme_st1q_horiz>( |
| 107 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 108 | } |
| 109 | } else { |
| 110 | switch (type) { |
| 111 | case arm_sme::ArmSMETileType::ZAB: |
| 112 | return rewriter.create<arm_sme::aarch64_sme_st1b_vert>( |
| 113 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 114 | case arm_sme::ArmSMETileType::ZAH: |
| 115 | return rewriter.create<arm_sme::aarch64_sme_st1h_vert>( |
| 116 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 117 | case arm_sme::ArmSMETileType::ZAS: |
| 118 | return rewriter.create<arm_sme::aarch64_sme_st1w_vert>( |
| 119 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 120 | case arm_sme::ArmSMETileType::ZAD: |
| 121 | return rewriter.create<arm_sme::aarch64_sme_st1d_vert>( |
| 122 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 123 | case arm_sme::ArmSMETileType::ZAQ: |
| 124 | return rewriter.create<arm_sme::aarch64_sme_st1q_vert>( |
| 125 | location: loc, args&: maskOp, args&: ptr, args&: tileId, args&: tileSliceI32); |
| 126 | } |
| 127 | } |
| 128 | llvm_unreachable("unknown type in createStoreTileSliceIntrinsic" ); |
| 129 | } |
| 130 | |
| 131 | IntegerAttr getTileIdOrError(arm_sme::ArmSMETileOpInterface op) { |
| 132 | auto tileId = op.getTileId(); |
| 133 | if (!tileId) |
| 134 | op.emitOpError( |
| 135 | message: "expected tile ID to be allocated before conversion to LLVM" ); |
| 136 | return tileId; |
| 137 | } |
| 138 | |
| 139 | /// Creates an alloca matching the size of tile used by `tileOp`. The alloca is |
| 140 | /// placed in the first block of the function. |
| 141 | static memref::AllocaOp |
| 142 | createAllocaForTile(RewriterBase &rewriter, Location loc, |
| 143 | FunctionOpInterface func, |
| 144 | arm_sme::ArmSMETileOpInterface tileOp) { |
| 145 | RewriterBase::InsertionGuard g(rewriter); |
| 146 | // Move to the first operation in the function. |
| 147 | rewriter.setInsertionPointToStart(&func.getBlocks().front()); |
| 148 | // Create an alloca matching the tile size of the `tileOp`. |
| 149 | auto vscale = rewriter.create<vector::VectorScaleOp>(location: loc); |
| 150 | auto tileElementType = tileOp.getTileType().getElementType(); |
| 151 | auto memrefType = MemRefType::get( |
| 152 | shape: {ShapedType::kDynamic, ShapedType::kDynamic}, elementType: tileElementType); |
| 153 | unsigned minElements = arm_sme::getSMETileSliceMinNumElts(type: tileElementType); |
| 154 | auto minElementsOp = |
| 155 | rewriter.create<arith::ConstantIndexOp>(location: loc, args&: minElements); |
| 156 | auto vectorLen = rewriter.create<arith::MulIOp>(location: loc, args&: vscale, args&: minElementsOp); |
| 157 | auto alloca = rewriter.create<memref::AllocaOp>( |
| 158 | location: loc, args&: memrefType, args: ValueRange{vectorLen, vectorLen}); |
| 159 | return alloca; |
| 160 | } |
| 161 | |
| 162 | /// Finds or creates an alloca for a spill of a tile. |
| 163 | static memref::AllocaOp getOrCreateAllocaForTile( |
| 164 | RewriterBase &rewriter, Location loc, FunctionOpInterface func, |
| 165 | arm_sme::ArmSMETileOpInterface tileOp, unsigned tileId) { |
| 166 | // Find an alloca at the top of the function tagged with a |
| 167 | // 'arm_sme.in_memory_tile_id' that matches `tileId`. |
| 168 | for (auto &op : func.getBlocks().front()) { |
| 169 | auto alloca = llvm::dyn_cast<memref::AllocaOp>(Val&: op); |
| 170 | if (!alloca) |
| 171 | continue; |
| 172 | auto inMemoryTileId = llvm::dyn_cast_or_null<IntegerAttr>( |
| 173 | Val: alloca->getDiscardableAttr(name: kInMemoryTileIdAttr)); |
| 174 | if (!inMemoryTileId) |
| 175 | continue; |
| 176 | if (inMemoryTileId.getInt() == tileId) |
| 177 | return alloca; |
| 178 | } |
| 179 | // Otherwise, create a new alloca: |
| 180 | auto alloca = createAllocaForTile(rewriter, loc, func, tileOp); |
| 181 | alloca->setDiscardableAttr(name: kInMemoryTileIdAttr, |
| 182 | value: rewriter.getI32IntegerAttr(value: tileId)); |
| 183 | return alloca; |
| 184 | } |
| 185 | |
| 186 | /// Very naive lowering of in-memory tiles (i.e. tiles that were not assigned a |
| 187 | /// hardware tile ID) to ArmSME intrinsics. Currently, this works by assigning |
| 188 | /// the op to tile 0, then emitting a full tile swap between ZA and memory |
| 189 | /// before + after the tile op. |
| 190 | /// |
| 191 | /// Example: |
| 192 | /// |
| 193 | /// // Note: <IN MEMORY TILE> = tile ID >= 16. |
| 194 | /// arm_sme.tile_op { tile_id = <IN MEMORY TILE> } |
| 195 | /// |
| 196 | /// is converted to: |
| 197 | /// // At function entry: |
| 198 | /// %spill = memref.alloca ... : memref<?x?xty> |
| 199 | /// |
| 200 | /// // Around op: |
| 201 | /// scf.for %slice_idx { |
| 202 | /// %slice_to_save = "arm_sme.intr.read.horiz" ... <{tile_id = 0 : i32}> |
| 203 | /// "arm_sme.intr.ld1h.horiz"(%spill, %slice_idx) <{tile_id = 0 : i32}> |
| 204 | /// vector.store %slice_to_save, %spill[%slice_idx, %c0] |
| 205 | /// } |
| 206 | /// arm_sme.tile_op { tile_id = 0 } |
| 207 | /// scf.for %slice_idx { |
| 208 | /// %slice_to_save = "arm_sme.intr.read.horiz" ... <{tile_id = 0 : i32}> |
| 209 | /// "arm_sme.intr.ld1h.horiz"(%spill, %slice_idx) <{tile_id = 0 : i32}> |
| 210 | /// vector.store %slice_to_save, %spill[%slice_idx, %c0] |
| 211 | /// } |
| 212 | /// |
| 213 | /// Note that these spills/fills are not inserted earlier as concept of a |
| 214 | /// register, and the need to swap the contents, can't really be represented |
| 215 | /// correctly at a high level in MLIR. |
| 216 | /// |
| 217 | /// TODO: Reduce the spills/reloads to single slices where possible (and omit |
| 218 | /// redundant reloads). This could be done via a method on the |
| 219 | /// `ArmSMETileOpInterface` which returns how the operation uses ZA. E.g.: |
| 220 | /// |
| 221 | /// `tileOp.getZaUsage()` could return: |
| 222 | /// |
| 223 | /// struct ArmSMEOpZAUsage { |
| 224 | /// enum class Kind { |
| 225 | /// TileRead, // Omit store after tile operation. |
| 226 | /// TileWrite, // Omit load before tile operation. |
| 227 | /// TileReadWrite, // Needs both tile load and store. |
| 228 | /// SliceRead, // Spill single slice and omit store after operation. |
| 229 | /// SliceWrite, // Spill single slice and omit load before operation. |
| 230 | /// SliceReadWrite // Spill single slice. |
| 231 | /// }; |
| 232 | /// Value sliceIndex {}; |
| 233 | /// TileSliceLayout sliceLayout { TileSliceLayout::Horizontal }; |
| 234 | /// }; |
| 235 | /// |
| 236 | struct ConvertArmSMESpillsAndFillsToLLVM : public ConvertToLLVMPattern { |
| 237 | |
| 238 | ConvertArmSMESpillsAndFillsToLLVM(StringRef rootOpName, |
| 239 | const LLVMTypeConverter &typeConverter, |
| 240 | PatternBenefit benefit) |
| 241 | : ConvertToLLVMPattern(rootOpName, &typeConverter.getContext(), |
| 242 | typeConverter, benefit) {} |
| 243 | |
| 244 | LogicalResult |
| 245 | matchAndRewrite(Operation *op, ArrayRef<Value> operands, |
| 246 | ConversionPatternRewriter &rewriter) const override { |
| 247 | auto tileOp = cast<arm_sme::ArmSMETileOpInterface>(Val: op); |
| 248 | // Tile has a real (hardware) tile. No spills/reloads required. |
| 249 | if (!tileOp.isInMemoryTile()) |
| 250 | return failure(); |
| 251 | |
| 252 | tileOp->emitWarning( |
| 253 | message: "failed to allocate SME virtual tile to operation, tile value will go " |
| 254 | "through memory, expect degraded performance" ); |
| 255 | |
| 256 | // Step 1. Create an alloca for the tile at the top of the function (if one |
| 257 | // does not already exist). |
| 258 | auto loc = tileOp.getLoc(); |
| 259 | auto func = tileOp->getParentOfType<FunctionOpInterface>(); |
| 260 | auto tileAlloca = getOrCreateAllocaForTile(rewriter, loc, func, tileOp, |
| 261 | tileId: tileOp.getTileId().getInt()); |
| 262 | |
| 263 | // Step 2. Assign the op a real tile ID. |
| 264 | // For simplicity, we always use tile 0 (which always exists). |
| 265 | auto zeroTileId = rewriter.getI32IntegerAttr(value: 0); |
| 266 | rewriter.modifyOpInPlace(root: tileOp, callable: [&] { tileOp.setTileId(zeroTileId); }); |
| 267 | |
| 268 | VectorType tileVectorType = tileOp.getTileType(); |
| 269 | auto sliceType = VectorType::Builder(tileVectorType).dropDim(pos: 0); |
| 270 | auto swapInMemoryTileWithSMETileZero = [&] { |
| 271 | emitFullTileSwap(rewriter, loc, tileAlloca, |
| 272 | tileType: *arm_sme::getSMETileType(tileVectorType), sliceType, |
| 273 | tileId: zeroTileId); |
| 274 | }; |
| 275 | |
| 276 | // Step 3. Emit tile swaps before and after the op. |
| 277 | // TODO: Reduce the amount spilled to the amount of data the `tileOp` |
| 278 | // touches (i.e. a single tile slice). |
| 279 | { |
| 280 | rewriter.setInsertionPoint(op); |
| 281 | // Swap the contents of ZA and the in-memory tile before the op. |
| 282 | swapInMemoryTileWithSMETileZero(); |
| 283 | rewriter.setInsertionPointAfter(op); |
| 284 | // Swap the tile back out to memory again after the op. |
| 285 | swapInMemoryTileWithSMETileZero(); |
| 286 | } |
| 287 | |
| 288 | return success(); |
| 289 | } |
| 290 | |
| 291 | /// Extracts a pointer to a slice of an in-memory tile. |
| 292 | Value getInMemoryTileSlicePtr(RewriterBase &rewriter, Location loc, |
| 293 | Value tileMemory, Value sliceIndex) const { |
| 294 | auto llvmType = getTypeConverter()->convertType(t: tileMemory.getType()); |
| 295 | auto descriptor = |
| 296 | rewriter.create<UnrealizedConversionCastOp>(location: loc, args&: llvmType, args&: tileMemory); |
| 297 | auto zero = rewriter.create<arith::ConstantIntOp>(location: loc, args: 0, /*width=*/args: 64); |
| 298 | auto sliceIndexI64 = rewriter.create<arith::IndexCastOp>( |
| 299 | location: loc, args: rewriter.getI64Type(), args&: sliceIndex); |
| 300 | return getStridedElementPtr( |
| 301 | rewriter&: static_cast<ConversionPatternRewriter &>(rewriter), loc, |
| 302 | type: llvm::cast<MemRefType>(Val: tileMemory.getType()), memRefDesc: descriptor.getResult(i: 0), |
| 303 | indices: {sliceIndexI64, zero}); |
| 304 | } |
| 305 | |
| 306 | /// Emits an in-place swap of a slice of a tile in ZA and a slice of a |
| 307 | /// tile-sized memref (`tileAlloca`). |
| 308 | void emitSliceSwap(RewriterBase &rewriter, Location loc, Value tileAlloca, |
| 309 | arm_sme::ArmSMETileType tileType, VectorType sliceType, |
| 310 | IntegerAttr tileId, Value sliceIndex) const { |
| 311 | // Cast the slice index to an i32. |
| 312 | auto sliceIndexI32 = rewriter.create<arith::IndexCastOp>( |
| 313 | location: loc, args: rewriter.getI32Type(), args&: sliceIndex); |
| 314 | // Create an all-true predicate for the slice. |
| 315 | auto predicateType = sliceType.clone(elementType: rewriter.getI1Type()); |
| 316 | auto allTruePredicate = rewriter.create<arith::ConstantOp>( |
| 317 | location: loc, args: DenseElementsAttr::get(type: predicateType, value: true)); |
| 318 | // Create padding vector (never used due to all-true predicate). |
| 319 | auto padVector = rewriter.create<LLVM::PoisonOp>(location: loc, args&: sliceType); |
| 320 | // Get a pointer to the current slice. |
| 321 | auto slicePtr = |
| 322 | getInMemoryTileSlicePtr(rewriter, loc, tileMemory: tileAlloca, sliceIndex); |
| 323 | // Read the value of the current slice from ZA. |
| 324 | auto currentTileSlice = rewriter.create<arm_sme::aarch64_sme_read_horiz>( |
| 325 | location: loc, args&: sliceType, args&: padVector, args&: allTruePredicate, args&: tileId, args&: sliceIndexI32); |
| 326 | // Load the new tile slice back from memory into ZA. |
| 327 | createLoadTileSliceIntrinsic( |
| 328 | rewriter, loc, type: tileType, layout: arm_sme::TileSliceLayout::Horizontal, |
| 329 | maskOp: allTruePredicate, ptr: slicePtr, tileId, tileSliceI32: sliceIndexI32); |
| 330 | // Store the current tile slice to memory. |
| 331 | auto zero = rewriter.create<arith::ConstantIndexOp>(location: loc, args: 0); |
| 332 | rewriter.create<vector::StoreOp>(location: loc, args&: currentTileSlice, args&: tileAlloca, |
| 333 | args: ValueRange{sliceIndex, zero}); |
| 334 | } |
| 335 | |
| 336 | /// Emits a full in-place swap of the contents of a tile in ZA and a |
| 337 | /// tile-sized memref (`tileAlloca`). |
| 338 | void emitFullTileSwap(RewriterBase &rewriter, Location loc, Value tileAlloca, |
| 339 | arm_sme::ArmSMETileType tileType, VectorType sliceType, |
| 340 | IntegerAttr tileId) const { |
| 341 | RewriterBase::InsertionGuard guard(rewriter); |
| 342 | // Create an scf.for over all tile slices. |
| 343 | auto minNumElts = |
| 344 | rewriter.create<arith::ConstantIndexOp>(location: loc, args: sliceType.getDimSize(idx: 0)); |
| 345 | auto lowerBound = rewriter.create<arith::ConstantIndexOp>(location: loc, args: 0); |
| 346 | auto upperBound = rewriter.create<arith::MulIOp>( |
| 347 | location: loc, args&: minNumElts, args: rewriter.create<vector::VectorScaleOp>(location: loc)); |
| 348 | auto step = rewriter.create<arith::ConstantIndexOp>(location: loc, args: 1); |
| 349 | auto forOp = rewriter.create<scf::ForOp>(location: loc, args&: lowerBound, args&: upperBound, args&: step); |
| 350 | // Emit a swap for each tile slice. |
| 351 | rewriter.setInsertionPointToStart(forOp.getBody()); |
| 352 | auto sliceIndex = forOp.getInductionVar(); |
| 353 | emitSliceSwap(rewriter, loc, tileAlloca, tileType, sliceType, tileId, |
| 354 | sliceIndex); |
| 355 | } |
| 356 | }; |
| 357 | |
| 358 | enum class RequiresSpillsAndFills { Yes, No }; |
| 359 | |
| 360 | /// Base class for ArmSME to LLVM conversion patterns. By default, this adds |
| 361 | /// spills and fills around ArmSME ops that use in-memory tile IDs. This can be |
| 362 | /// disabled by setting the `requiresSpillsAndFills` template parameter to |
| 363 | /// `RequiresSpillsAndFills::No`. |
| 364 | template <typename SourceOp, RequiresSpillsAndFills requiresSpillsAndFills = |
| 365 | RequiresSpillsAndFills::Yes> |
| 366 | struct ConvertArmSMEOpToLLVMPattern : ConvertOpToLLVMPattern<SourceOp> { |
| 367 | using ArmSMEOp = SourceOp; |
| 368 | using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; |
| 369 | |
| 370 | static constexpr bool requiresSpillsAndFillsConversion() { |
| 371 | return requiresSpillsAndFills == RequiresSpillsAndFills::Yes; |
| 372 | } |
| 373 | }; |
| 374 | |
| 375 | template <typename Pattern> |
| 376 | static void addArmSMEConversionPattern(RewritePatternSet &patterns, |
| 377 | LLVMTypeConverter const &typeConverter) { |
| 378 | // Register spills/fills for ops that implement the |
| 379 | // `ArmSMETileOpInterface` and have `requiresSpillsAndFills` set to |
| 380 | // `RequiresSpillsAndFills::Yes`. |
| 381 | if constexpr (Pattern::requiresSpillsAndFillsConversion() && |
| 382 | std::is_base_of_v<arm_sme::ArmSMETileOpInterface::Trait< |
| 383 | typename Pattern::ArmSMEOp>, |
| 384 | typename Pattern::ArmSMEOp>) { |
| 385 | // Add spill/fill conversions with a very high benefit to ensure |
| 386 | // they are lowered first. |
| 387 | patterns.add<ConvertArmSMESpillsAndFillsToLLVM>( |
| 388 | Pattern::ArmSMEOp::getOperationName(), typeConverter, |
| 389 | /*benefit=*/1337); |
| 390 | } |
| 391 | patterns.add<Pattern>(typeConverter); |
| 392 | } |
| 393 | |
| 394 | /// Helper to register `ConvertArmSMEOpToLLVMPattern` patterns. |
| 395 | template <typename... Patterns> |
| 396 | static void |
| 397 | addArmSMEConversionPatterns(RewritePatternSet &patterns, |
| 398 | LLVMTypeConverter const &typeConverter) { |
| 399 | (addArmSMEConversionPattern<Patterns>(patterns, typeConverter), ...); |
| 400 | } |
| 401 | |
| 402 | /// Lower 'arm_sme.zero' to SME intrinsics. |
| 403 | /// |
| 404 | /// BEFORE: |
| 405 | /// ```mlir |
| 406 | /// %v = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xi32> |
| 407 | /// ``` |
| 408 | /// |
| 409 | /// AFTER: |
| 410 | /// ```mlir |
| 411 | /// "arm_sme.intr.zero"() <{tile_mask = 17 : i32}> : () -> () |
| 412 | /// %v = arm_sme.get_tile : vector<[4]x[4]xi32> |
| 413 | /// ``` |
| 414 | /// |
| 415 | /// The 'arm_sme.get_tile' (which models the return) will fold away once all |
| 416 | /// ArmSME ops have been converted to LLVM intrinsics. |
| 417 | struct ZeroOpConversion : public ConvertArmSMEOpToLLVMPattern<arm_sme::ZeroOp> { |
| 418 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 419 | |
| 420 | LogicalResult |
| 421 | matchAndRewrite(arm_sme::ZeroOp zero, OpAdaptor adaptor, |
| 422 | ConversionPatternRewriter &rewriter) const override { |
| 423 | auto loc = zero.getLoc(); |
| 424 | |
| 425 | auto tileId = getTileIdOrError(op: zero); |
| 426 | if (!tileId) |
| 427 | return failure(); |
| 428 | |
| 429 | // Get the base mask for tile based on the element size. |
| 430 | // The base mask is just the mask to zero the first tile (of a size). |
| 431 | // These masks are derived from: |
| 432 | // https://developer.arm.com/documentation/ddi0602/2022-06/SME-Instructions/ZERO--Zero-a-list-of-64-bit-element-ZA-tiles- |
| 433 | arm_sme::ArmSMETileType tileType = |
| 434 | *arm_sme::getSMETileType(zero.getTileType()); |
| 435 | auto baseMaskForSize = [&] { |
| 436 | switch (tileType) { |
| 437 | case arm_sme::ArmSMETileType::ZAB: |
| 438 | // Zeroing the 8-bit ZA0.B tile is equivalent to zeroing all eight |
| 439 | // 64-bit element tiles named ZA0.D to ZA7.D. |
| 440 | return 0b1111'1111; |
| 441 | case arm_sme::ArmSMETileType::ZAH: |
| 442 | // Zeroing the 16-bit ZA0.H tile is equivalent to zeroing 64-bit |
| 443 | // element tiles named ZA0.D, ZA2.D, ZA4.D, and ZA6.D. Shift this left |
| 444 | // once for ZA1.H. |
| 445 | return 0b0101'0101; |
| 446 | case arm_sme::ArmSMETileType::ZAS: |
| 447 | // Zeroing the 32-bit ZA0.S tile is equivalent to zeroing 64-bit |
| 448 | // element tiles named ZA0.D and ZA4.D. |
| 449 | // Shift left by 1, 2, or 3 respectively for ZA1.S, ZA2.S, ZA3.S. |
| 450 | return 0b0001'0001; |
| 451 | case arm_sme::ArmSMETileType::ZAD: |
| 452 | // Zeroing one of the a 64-bit tiles ZA0.D to ZA7.D just requires |
| 453 | // setting the bit for that tile. |
| 454 | return 0b0000'0001; |
| 455 | default: |
| 456 | llvm_unreachable("bad element size" ); |
| 457 | } |
| 458 | }(); |
| 459 | |
| 460 | // The actual mask is just the base mask shifted by the tile ID. |
| 461 | // This will be folded to a constant after tile allocation. |
| 462 | // |
| 463 | // The shift is just derived from the layout of the tiles, and that the tile |
| 464 | // ID is the index of the tile. For example, looking at the 32-bit ZAx.S |
| 465 | // tiles: |
| 466 | // |
| 467 | // ZA0.S = ZA0.D and ZA4.D |
| 468 | // * Tile ID -> 0 |
| 469 | // * Mask -> 00010001 = (00010001 << 0) |
| 470 | // ZA1.S = ZA1.D and ZA5.D |
| 471 | // * Tile ID -> 1 |
| 472 | // * Mask -> 00100010 = (00010001 << 1) |
| 473 | // ZA2.S = ZA2.D and ZA6.D |
| 474 | // * Tile ID -> 2 |
| 475 | // * Mask -> 01000100 = (00010001 << 2) |
| 476 | // ZA3.S = ZA3.D and ZA7.D |
| 477 | // * Tile ID -> 3 |
| 478 | // * Mask -> 10001000 = (00010001 << 3) |
| 479 | // |
| 480 | // This holds for all tile sizes. |
| 481 | int32_t zeroMask = baseMaskForSize << int32_t(tileId.getInt()); |
| 482 | rewriter.create<arm_sme::aarch64_sme_zero>( |
| 483 | location: loc, args: rewriter.getI32IntegerAttr(value: zeroMask)); |
| 484 | |
| 485 | // Create a placeholder op to preserve dataflow. |
| 486 | // Note: Place the `get_tile` op at the start of the block. This ensures |
| 487 | // that if there are multiple `zero` ops the intrinsics will be consecutive. |
| 488 | rewriter.setInsertionPointToStart(zero->getBlock()); |
| 489 | rewriter.replaceOpWithNewOp<arm_sme::GetTileOp>(op: zero, args: zero.getVectorType()); |
| 490 | |
| 491 | return success(); |
| 492 | } |
| 493 | }; |
| 494 | |
| 495 | /// Lower `arm_sme.load_tile_slice` to SME intrinsics. |
| 496 | struct LoadTileSliceConversion |
| 497 | : public ConvertArmSMEOpToLLVMPattern<arm_sme::LoadTileSliceOp> { |
| 498 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 499 | |
| 500 | LogicalResult |
| 501 | matchAndRewrite(arm_sme::LoadTileSliceOp loadTileSliceOp, |
| 502 | arm_sme::LoadTileSliceOp::Adaptor adaptor, |
| 503 | ConversionPatternRewriter &rewriter) const override { |
| 504 | auto loc = loadTileSliceOp.getLoc(); |
| 505 | auto tileId = getTileIdOrError(op: loadTileSliceOp); |
| 506 | if (!tileId) |
| 507 | return failure(); |
| 508 | |
| 509 | Value ptr = this->getStridedElementPtr( |
| 510 | rewriter, loc, type: loadTileSliceOp.getMemRefType(), memRefDesc: adaptor.getBase(), |
| 511 | indices: adaptor.getIndices()); |
| 512 | |
| 513 | auto tileSlice = loadTileSliceOp.getTileSliceIndex(); |
| 514 | |
| 515 | // Cast tile slice to i32 for intrinsic. |
| 516 | auto tileSliceI32 = rewriter.create<arith::IndexCastUIOp>( |
| 517 | location: loc, args: rewriter.getI32Type(), args&: tileSlice); |
| 518 | |
| 519 | // Create all active predicate mask. |
| 520 | auto maskOp = loadTileSliceOp.getMask(); |
| 521 | |
| 522 | auto tileVectorType = loadTileSliceOp.getVectorType(); |
| 523 | arm_sme::ArmSMETileType tileType = *arm_sme::getSMETileType(tileVectorType); |
| 524 | arm_sme::TileSliceLayout layout = loadTileSliceOp.getLayout(); |
| 525 | |
| 526 | // Create 'arm_sme.intr.ld1*.(horiz|vert)' intrinsic to load ZA tile slice. |
| 527 | createLoadTileSliceIntrinsic(rewriter, loc, type: tileType, layout, maskOp, ptr, |
| 528 | tileId, tileSliceI32); |
| 529 | |
| 530 | // The load intrinsics have no result, replace 'arm_sme.tile_load' with |
| 531 | // the input tile to preserve dataflow. |
| 532 | rewriter.replaceOp(op: loadTileSliceOp, newValues: loadTileSliceOp.getTile()); |
| 533 | |
| 534 | return success(); |
| 535 | } |
| 536 | }; |
| 537 | |
| 538 | /// Lower for `arm_sme.store_tile_slice` to SME intrinsics. |
| 539 | struct StoreTileSliceConversion |
| 540 | : public ConvertArmSMEOpToLLVMPattern<arm_sme::StoreTileSliceOp> { |
| 541 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 542 | |
| 543 | LogicalResult |
| 544 | matchAndRewrite(arm_sme::StoreTileSliceOp storeTileSliceOp, |
| 545 | arm_sme::StoreTileSliceOp::Adaptor adaptor, |
| 546 | ConversionPatternRewriter &rewriter) const override { |
| 547 | auto loc = storeTileSliceOp.getLoc(); |
| 548 | auto tileVectorType = storeTileSliceOp.getVectorType(); |
| 549 | |
| 550 | auto tileId = getTileIdOrError(op: storeTileSliceOp); |
| 551 | if (!tileId) |
| 552 | return failure(); |
| 553 | |
| 554 | // Create 'arm_sme.intr.st1*.horiz' intrinsic to store ZA tile slice. |
| 555 | Value ptr = this->getStridedElementPtr( |
| 556 | rewriter, loc, type: storeTileSliceOp.getMemRefType(), memRefDesc: adaptor.getBase(), |
| 557 | indices: adaptor.getIndices()); |
| 558 | |
| 559 | auto tileSlice = storeTileSliceOp.getTileSliceIndex(); |
| 560 | |
| 561 | // Cast tile slice to i32 for intrinsic. |
| 562 | auto tileSliceI32 = rewriter.create<arith::IndexCastUIOp>( |
| 563 | location: loc, args: rewriter.getI32Type(), args&: tileSlice); |
| 564 | |
| 565 | auto maskOp = storeTileSliceOp.getMask(); |
| 566 | |
| 567 | arm_sme::TileSliceLayout layout = storeTileSliceOp.getLayout(); |
| 568 | arm_sme::ArmSMETileType tileType = *arm_sme::getSMETileType(tileVectorType); |
| 569 | |
| 570 | rewriter.replaceOp(op: storeTileSliceOp, |
| 571 | newOp: createStoreTileSliceIntrinsic(rewriter, loc, type: tileType, |
| 572 | layout, maskOp, ptr, |
| 573 | tileId, tileSliceI32)); |
| 574 | |
| 575 | return success(); |
| 576 | } |
| 577 | }; |
| 578 | |
| 579 | /// Lower `arm_sme.insert_tile_slice` to SME intrinsics. |
| 580 | struct InsertTileSliceConversion |
| 581 | : public ConvertArmSMEOpToLLVMPattern<arm_sme::InsertTileSliceOp> { |
| 582 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 583 | |
| 584 | LogicalResult |
| 585 | matchAndRewrite(arm_sme::InsertTileSliceOp insertTileSliceOp, |
| 586 | arm_sme::InsertTileSliceOp::Adaptor adaptor, |
| 587 | ConversionPatternRewriter &rewriter) const override { |
| 588 | auto loc = insertTileSliceOp.getLoc(); |
| 589 | auto tileType = insertTileSliceOp.getTileType(); |
| 590 | |
| 591 | auto tileId = getTileIdOrError(op: insertTileSliceOp); |
| 592 | if (!tileId) |
| 593 | return failure(); |
| 594 | |
| 595 | auto tileSlice = insertTileSliceOp.getTileSliceIndex(); |
| 596 | |
| 597 | // Cast tile slice from index to i32 for intrinsic. |
| 598 | auto tileSliceI32 = rewriter.create<arith::IndexCastUIOp>( |
| 599 | location: loc, args: rewriter.getI32Type(), args&: tileSlice); |
| 600 | |
| 601 | // Create all active predicate mask. |
| 602 | auto one = rewriter.create<arith::ConstantOp>( |
| 603 | location: loc, args: rewriter.getI1Type(), |
| 604 | args: rewriter.getIntegerAttr(type: rewriter.getI1Type(), value: 1)); |
| 605 | auto predTy = VectorType::get(shape: tileType.getShape()[0], elementType: rewriter.getI1Type(), |
| 606 | /*scalableDims=*/{true}); |
| 607 | auto allActiveMask = rewriter.create<vector::SplatOp>(location: loc, args&: predTy, args&: one); |
| 608 | |
| 609 | // Create 'arm_sme.intr.write.(horiz|vert)' to write vector to tile slice. |
| 610 | switch (insertTileSliceOp.getLayout()) { |
| 611 | case arm_sme::TileSliceLayout::Horizontal: |
| 612 | rewriter.create<arm_sme::aarch64_sme_write_horiz>( |
| 613 | location: loc, args&: tileId, args&: tileSliceI32, args&: allActiveMask, |
| 614 | args: insertTileSliceOp.getVector()); |
| 615 | break; |
| 616 | case arm_sme::TileSliceLayout::Vertical: |
| 617 | rewriter.create<arm_sme::aarch64_sme_write_vert>( |
| 618 | location: loc, args&: tileId, args&: tileSliceI32, args&: allActiveMask, |
| 619 | args: insertTileSliceOp.getVector()); |
| 620 | break; |
| 621 | } |
| 622 | |
| 623 | // Intrinsic has no result, replace 'arm_sme.insert_tile_slice' with |
| 624 | // the input tile to preserve dataflow. |
| 625 | rewriter.replaceOp(op: insertTileSliceOp, newValues: insertTileSliceOp.getTile()); |
| 626 | |
| 627 | return success(); |
| 628 | } |
| 629 | }; |
| 630 | |
| 631 | /// Lower `arm_sme.extract_tile_slice` to SME intrinsics. |
| 632 | struct |
| 633 | : public ConvertArmSMEOpToLLVMPattern<arm_sme::ExtractTileSliceOp> { |
| 634 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 635 | |
| 636 | LogicalResult |
| 637 | matchAndRewrite(arm_sme::ExtractTileSliceOp , OpAdaptor, |
| 638 | ConversionPatternRewriter &rewriter) const override { |
| 639 | auto loc = extractTileSlice.getLoc(); |
| 640 | auto sliceType = extractTileSlice.getSliceType(); |
| 641 | auto sliceIndex = extractTileSlice.getTileSliceIndex(); |
| 642 | |
| 643 | auto tileId = getTileIdOrError(op: extractTileSlice); |
| 644 | if (!tileId) |
| 645 | return failure(); |
| 646 | |
| 647 | // Create an 'all true' predicate for the tile slice. |
| 648 | auto predicateType = sliceType.cloneWith(shape: {}, elementType: rewriter.getI1Type()); |
| 649 | auto allTruePredicate = rewriter.create<arith::ConstantOp>( |
| 650 | location: loc, args: DenseElementsAttr::get(type: predicateType, value: true)); |
| 651 | |
| 652 | // Zero destination/fallback for tile slice extraction. |
| 653 | auto zeroVector = rewriter.create<arith::ConstantOp>( |
| 654 | location: loc, args&: sliceType, args: rewriter.getZeroAttr(type: sliceType)); |
| 655 | |
| 656 | // Cast tile slice from index to i32 for intrinsic. |
| 657 | auto sliceIndexI32 = rewriter.create<arith::IndexCastOp>( |
| 658 | location: loc, args: rewriter.getI32Type(), args&: sliceIndex); |
| 659 | |
| 660 | // Create 'arm_sme.intr.read.(horiz|vert)' to extract the tile slice. |
| 661 | switch (extractTileSlice.getLayout()) { |
| 662 | case arm_sme::TileSliceLayout::Horizontal: |
| 663 | rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_read_horiz>( |
| 664 | op: extractTileSlice, args&: sliceType, args&: zeroVector, args&: allTruePredicate, args&: tileId, |
| 665 | args&: sliceIndexI32); |
| 666 | break; |
| 667 | case arm_sme::TileSliceLayout::Vertical: |
| 668 | rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_read_vert>( |
| 669 | op: extractTileSlice, args&: sliceType, args&: zeroVector, args&: allTruePredicate, args&: tileId, |
| 670 | args&: sliceIndexI32); |
| 671 | break; |
| 672 | } |
| 673 | |
| 674 | return success(); |
| 675 | } |
| 676 | }; |
| 677 | |
| 678 | /// Lower `arm_sme.outerproduct` to SME MOPA intrinsics. |
| 679 | /// |
| 680 | /// Example: |
| 681 | /// |
| 682 | /// %0 = arm_sme.outerproduct %lhs, %rhs acc(%acc) |
| 683 | /// : vector<[4]xf32>, vector<[4]xf32> |
| 684 | /// |
| 685 | /// is converted to: |
| 686 | /// |
| 687 | /// "arm_sme.intr.mopa"(%ptrue_s, %ptrue_s, %lhs, %rhs) <{tile_id = 0 : i32}> |
| 688 | /// : (vector<[4]xi1>, vector<[4]xi1>, vector<[4]xf32>, |
| 689 | /// vector<[4]xf32>) -> () |
| 690 | /// |
| 691 | /// Currently only supports FMOPA and BFMOPA (non-widening). |
| 692 | struct OuterProductOpConversion |
| 693 | : public ConvertArmSMEOpToLLVMPattern<arm_sme::OuterProductOp> { |
| 694 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 695 | |
| 696 | LogicalResult |
| 697 | matchAndRewrite(arm_sme::OuterProductOp outerProductOp, |
| 698 | arm_sme::OuterProductOp::Adaptor adaptor, |
| 699 | ConversionPatternRewriter &rewriter) const override { |
| 700 | auto tileId = getTileIdOrError(op: outerProductOp); |
| 701 | if (!tileId) |
| 702 | return failure(); |
| 703 | |
| 704 | auto isSupportedType = [](VectorType vectorType) { |
| 705 | // TODO: the FP outer product instruction variants are predicated on |
| 706 | // different features [1]: |
| 707 | // |
| 708 | // * FMOPA (non-widening) |
| 709 | // * half-precision - +sme2p1,+sme-f16f16 |
| 710 | // * single-precision - +sme |
| 711 | // * double-precision - +sme-f64f64 |
| 712 | // * BFMOPA |
| 713 | // * half-precision - +sme2p1,+b16b16 |
| 714 | // |
| 715 | // It should be possible to control lowering based on target features. |
| 716 | // [1] |
| 717 | // https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile |
| 718 | if ((vectorType.getRank() != 2) || !vectorType.allDimsScalable()) |
| 719 | return false; |
| 720 | |
| 721 | auto elementType = vectorType.getElementType(); |
| 722 | |
| 723 | if (!elementType.isF16() && !elementType.isBF16() && |
| 724 | !elementType.isF32() && !elementType.isF64()) |
| 725 | return false; |
| 726 | |
| 727 | unsigned minNumElts = arm_sme::MinStreamingVectorLengthInBits / |
| 728 | vectorType.getElementTypeBitWidth(); |
| 729 | return vectorType.getShape() == |
| 730 | ArrayRef<int64_t>({minNumElts, minNumElts}); |
| 731 | }; |
| 732 | |
| 733 | // TODO: Support CombiningKind::Sub for outer products. |
| 734 | if (outerProductOp.getKind() != arm_sme::CombiningKind::Add) |
| 735 | return outerProductOp.emitError(message: "unsupported kind" ); |
| 736 | |
| 737 | auto resultVectorType = outerProductOp.getResultType(); |
| 738 | if (!isSupportedType(resultVectorType)) |
| 739 | return outerProductOp.emitError(message: "unsupported type" ); |
| 740 | |
| 741 | auto loc = outerProductOp.getLoc(); |
| 742 | |
| 743 | Value acc = outerProductOp.getAcc(); |
| 744 | if (!acc) { |
| 745 | // Initalize accumulator with zero. |
| 746 | auto zero = rewriter.create<arm_sme::ZeroOp>(location: loc, args&: resultVectorType); |
| 747 | zero.setTileId(tileId); |
| 748 | acc = zero; |
| 749 | } |
| 750 | |
| 751 | Value lhsMask = outerProductOp.getLhsMask(); |
| 752 | Value rhsMask = outerProductOp.getRhsMask(); |
| 753 | |
| 754 | if (!lhsMask || !rhsMask) { |
| 755 | auto predTy = |
| 756 | outerProductOp.getLhsType().cloneWith(shape: {}, elementType: rewriter.getI1Type()); |
| 757 | Value allActiveMask = rewriter.create<arith::ConstantOp>( |
| 758 | location: loc, args: DenseElementsAttr::get(type: predTy, value: true)); |
| 759 | lhsMask = allActiveMask; |
| 760 | rhsMask = allActiveMask; |
| 761 | } |
| 762 | |
| 763 | // Create 'arm_sme.intr.mopa' outer product intrinsic. |
| 764 | rewriter.create<arm_sme::aarch64_sme_mopa>(location: loc, args&: tileId, args&: lhsMask, args&: rhsMask, |
| 765 | args: outerProductOp.getLhs(), |
| 766 | args: outerProductOp.getRhs()); |
| 767 | |
| 768 | // The outerproduct intrinsics have no result, replace |
| 769 | // 'arm_sme.outerproduct' with the input tile to preserve dataflow. |
| 770 | rewriter.replaceOp(op: outerProductOp, newValues: acc); |
| 771 | |
| 772 | return success(); |
| 773 | } |
| 774 | }; |
| 775 | |
| 776 | /// Lower 2-way and 4-way widening outer products to intrinsics. |
| 777 | template <class OuterProductWideningOp, class OuterProductWideningIntrOp> |
| 778 | struct OuterProductWideningOpConversion |
| 779 | : public ConvertArmSMEOpToLLVMPattern<OuterProductWideningOp> { |
| 780 | using ConvertArmSMEOpToLLVMPattern< |
| 781 | OuterProductWideningOp>::ConvertArmSMEOpToLLVMPattern; |
| 782 | |
| 783 | LogicalResult |
| 784 | matchAndRewrite(OuterProductWideningOp op, |
| 785 | typename OuterProductWideningOp::Adaptor adaptor, |
| 786 | ConversionPatternRewriter &rewriter) const override { |
| 787 | auto tileId = getTileIdOrError(op); |
| 788 | if (!tileId) |
| 789 | return failure(); |
| 790 | |
| 791 | auto loc = op.getLoc(); |
| 792 | Value acc = op.getAcc(); |
| 793 | if (!acc) { |
| 794 | // Initalize accumulator with zero. |
| 795 | auto zero = rewriter.create<arm_sme::ZeroOp>(loc, op.getResultType()); |
| 796 | zero.setTileId(tileId); |
| 797 | acc = zero; |
| 798 | } |
| 799 | |
| 800 | Value lhsMask = op.getLhsMask(); |
| 801 | Value rhsMask = op.getRhsMask(); |
| 802 | if (!lhsMask || !rhsMask) { |
| 803 | auto predTy = op.getLhsType().cloneWith({}, rewriter.getI1Type()); |
| 804 | Value allActiveMask = rewriter.create<arith::ConstantOp>( |
| 805 | loc, DenseElementsAttr::get(predTy, true)); |
| 806 | lhsMask = allActiveMask; |
| 807 | rhsMask = allActiveMask; |
| 808 | } |
| 809 | |
| 810 | rewriter.create<OuterProductWideningIntrOp>( |
| 811 | loc, tileId, lhsMask, rhsMask, adaptor.getLhs(), adaptor.getRhs()); |
| 812 | |
| 813 | // The outerproduct intrinsics have no result, replace |
| 814 | // 'arm_sme.outerproduct' with the input tile to preserve dataflow. |
| 815 | rewriter.replaceOp(op, acc); |
| 816 | |
| 817 | return success(); |
| 818 | } |
| 819 | }; |
| 820 | |
| 821 | /// Lower `arm_sme.streaming_vl` to SME CNTS intrinsics. |
| 822 | /// |
| 823 | /// Example: |
| 824 | /// |
| 825 | /// %0 = arm_sme.streaming_vl <half> |
| 826 | /// |
| 827 | /// is converted to: |
| 828 | /// |
| 829 | /// %cnt = "arm_sme.intr.cntsh"() : () -> i64 |
| 830 | /// %0 = arith.index_cast %cnt : i64 to index |
| 831 | /// |
| 832 | struct StreamingVLOpConversion |
| 833 | : public ConvertArmSMEOpToLLVMPattern<arm_sme::StreamingVLOp, |
| 834 | RequiresSpillsAndFills::No> { |
| 835 | using ConvertArmSMEOpToLLVMPattern::ConvertArmSMEOpToLLVMPattern; |
| 836 | |
| 837 | LogicalResult |
| 838 | matchAndRewrite(arm_sme::StreamingVLOp streamingVlOp, |
| 839 | arm_sme::StreamingVLOp::Adaptor adaptor, |
| 840 | ConversionPatternRewriter &rewriter) const override { |
| 841 | auto loc = streamingVlOp.getLoc(); |
| 842 | auto i64Type = rewriter.getI64Type(); |
| 843 | auto *intrOp = [&]() -> Operation * { |
| 844 | switch (streamingVlOp.getTypeSize()) { |
| 845 | case arm_sme::TypeSize::Byte: |
| 846 | return rewriter.create<arm_sme::aarch64_sme_cntsb>(location: loc, args&: i64Type); |
| 847 | case arm_sme::TypeSize::Half: |
| 848 | return rewriter.create<arm_sme::aarch64_sme_cntsh>(location: loc, args&: i64Type); |
| 849 | case arm_sme::TypeSize::Word: |
| 850 | return rewriter.create<arm_sme::aarch64_sme_cntsw>(location: loc, args&: i64Type); |
| 851 | case arm_sme::TypeSize::Double: |
| 852 | return rewriter.create<arm_sme::aarch64_sme_cntsd>(location: loc, args&: i64Type); |
| 853 | } |
| 854 | llvm_unreachable("unknown type size in StreamingVLOpConversion" ); |
| 855 | }(); |
| 856 | rewriter.replaceOpWithNewOp<arith::IndexCastOp>( |
| 857 | op: streamingVlOp, args: rewriter.getIndexType(), args: intrOp->getResult(idx: 0)); |
| 858 | return success(); |
| 859 | } |
| 860 | }; |
| 861 | |
| 862 | /// Merges consecutive `arm_sme.intr.zero` operations in a block by bitwise |
| 863 | /// or-ing the zero masks. Note: In future the backend _should_ handle this. |
| 864 | static void mergeConsecutiveTileZerosInBlock(Block *block) { |
| 865 | uint32_t mergedZeroMask = 0; |
| 866 | SmallVector<arm_sme::aarch64_sme_zero, 16> zeroOpsToMerge; |
| 867 | auto replaceMergedZeroOps = [&] { |
| 868 | auto cleanup = llvm::make_scope_exit(F: [&] { |
| 869 | mergedZeroMask = 0; |
| 870 | zeroOpsToMerge.clear(); |
| 871 | }); |
| 872 | if (zeroOpsToMerge.size() <= 1) |
| 873 | return; |
| 874 | IRRewriter rewriter(zeroOpsToMerge.front()); |
| 875 | rewriter.create<arm_sme::aarch64_sme_zero>( |
| 876 | location: zeroOpsToMerge.front().getLoc(), |
| 877 | args: rewriter.getI32IntegerAttr(value: mergedZeroMask)); |
| 878 | for (auto zeroOp : zeroOpsToMerge) |
| 879 | rewriter.eraseOp(op: zeroOp); |
| 880 | }; |
| 881 | for (Operation &op : *block) { |
| 882 | if (auto zeroOp = dyn_cast<arm_sme::aarch64_sme_zero>(Val&: op)) { |
| 883 | mergedZeroMask |= zeroOp.getTileMask(); |
| 884 | zeroOpsToMerge.push_back(Elt: zeroOp); |
| 885 | } else { |
| 886 | replaceMergedZeroOps(); |
| 887 | } |
| 888 | } |
| 889 | replaceMergedZeroOps(); |
| 890 | } |
| 891 | |
| 892 | } // namespace |
| 893 | |
| 894 | namespace { |
| 895 | |
| 896 | struct ConvertArmSMEToLLVMPass |
| 897 | : public impl::ConvertArmSMEToLLVMBase<ConvertArmSMEToLLVMPass> { |
| 898 | ConvertArmSMEToLLVMPass(bool dumpTileLiveRanges) { |
| 899 | this->dumpTileLiveRanges = dumpTileLiveRanges; |
| 900 | } |
| 901 | void runOnOperation() override { |
| 902 | auto function = getOperation(); |
| 903 | |
| 904 | if (failed(Result: arm_sme::allocateSMETiles(function, dumpRanges: dumpTileLiveRanges))) |
| 905 | return signalPassFailure(); |
| 906 | |
| 907 | LLVMConversionTarget target(getContext()); |
| 908 | RewritePatternSet patterns(&getContext()); |
| 909 | LLVMTypeConverter converter(&getContext()); |
| 910 | configureArmSMEToLLVMConversionLegality(target); |
| 911 | populateArmSMEToLLVMConversionPatterns(converter, patterns); |
| 912 | |
| 913 | if (failed(Result: applyPartialConversion(op: function, target, patterns: std::move(patterns)))) |
| 914 | signalPassFailure(); |
| 915 | |
| 916 | function->walk(callback&: mergeConsecutiveTileZerosInBlock); |
| 917 | |
| 918 | // Walk the function and fail if there are unexpected operations on SME |
| 919 | // tile types after conversion. |
| 920 | function->walk(callback: [&](Operation *op) { |
| 921 | // These ops are legal post conversion, skip these. |
| 922 | if (isa<arm_sme::CopyTileOp, arm_sme::GetTileOp, cf::BranchOp>(Val: op) || |
| 923 | !op->isRegistered()) |
| 924 | return; |
| 925 | auto isSMETileType = [](Type type) { |
| 926 | return arm_sme::isValidSMETileVectorType(type); |
| 927 | }; |
| 928 | if (llvm::any_of(Range: op->getResultTypes(), P: isSMETileType) || |
| 929 | llvm::any_of(Range: op->getOperandTypes(), P: isSMETileType)) { |
| 930 | op->emitOpError(message: "unexpected operation with SME tile type after " |
| 931 | "conversion to LLVM" ); |
| 932 | signalPassFailure(); |
| 933 | } |
| 934 | }); |
| 935 | } |
| 936 | }; |
| 937 | |
| 938 | } // namespace |
| 939 | |
| 940 | void mlir::configureArmSMEToLLVMConversionLegality(ConversionTarget &target) { |
| 941 | target.addIllegalDialect<arm_sme::ArmSMEDialect>(); |
| 942 | target.addLegalOp< |
| 943 | arm_sme::aarch64_sme_zero, arm_sme::aarch64_sme_str, |
| 944 | arm_sme::aarch64_sme_ld1b_horiz, arm_sme::aarch64_sme_ld1h_horiz, |
| 945 | arm_sme::aarch64_sme_ld1w_horiz, arm_sme::aarch64_sme_ld1d_horiz, |
| 946 | arm_sme::aarch64_sme_ld1q_horiz, arm_sme::aarch64_sme_st1b_horiz, |
| 947 | arm_sme::aarch64_sme_st1h_horiz, arm_sme::aarch64_sme_st1w_horiz, |
| 948 | arm_sme::aarch64_sme_st1d_horiz, arm_sme::aarch64_sme_st1q_horiz, |
| 949 | arm_sme::aarch64_sme_ld1b_vert, arm_sme::aarch64_sme_ld1h_vert, |
| 950 | arm_sme::aarch64_sme_ld1w_vert, arm_sme::aarch64_sme_ld1d_vert, |
| 951 | arm_sme::aarch64_sme_ld1q_vert, arm_sme::aarch64_sme_st1b_vert, |
| 952 | arm_sme::aarch64_sme_st1h_vert, arm_sme::aarch64_sme_st1w_vert, |
| 953 | arm_sme::aarch64_sme_st1d_vert, arm_sme::aarch64_sme_st1q_vert, |
| 954 | arm_sme::aarch64_sme_read_horiz, arm_sme::aarch64_sme_read_vert, |
| 955 | arm_sme::aarch64_sme_write_horiz, arm_sme::aarch64_sme_write_vert, |
| 956 | arm_sme::aarch64_sme_mopa, arm_sme::aarch64_sme_mopa_wide, |
| 957 | arm_sme::aarch64_sme_mops_wide, arm_sme::aarch64_sme_smopa_wide, |
| 958 | arm_sme::aarch64_sme_smops_wide, arm_sme::aarch64_sme_umopa_wide, |
| 959 | arm_sme::aarch64_sme_umops_wide, arm_sme::aarch64_sme_smopa_za32, |
| 960 | arm_sme::aarch64_sme_smops_za32, arm_sme::aarch64_sme_umopa_za32, |
| 961 | arm_sme::aarch64_sme_umops_za32, arm_sme::aarch64_sme_sumopa_wide, |
| 962 | arm_sme::aarch64_sme_sumops_wide, arm_sme::aarch64_sme_usmopa_wide, |
| 963 | arm_sme::aarch64_sme_usmops_wide, arm_sme::aarch64_sme_cntsb, |
| 964 | arm_sme::aarch64_sme_cntsh, arm_sme::aarch64_sme_cntsw, |
| 965 | arm_sme::aarch64_sme_cntsd>(); |
| 966 | target.addLegalDialect<arith::ArithDialect, |
| 967 | /* The following are used to lower tile spills/fills */ |
| 968 | vector::VectorDialect, scf::SCFDialect, |
| 969 | memref::MemRefDialect>(); |
| 970 | // Pseudo operations. These cannot be code-generated but may exist in the |
| 971 | // input IR, or be generated during the conversion. They need to be eliminated |
| 972 | // before the final conversion to LLVM IR (and likely will be due to DCE). |
| 973 | target.addLegalOp<arm_sme::GetTileOp, arm_sme::CopyTileOp, |
| 974 | UnrealizedConversionCastOp>(); |
| 975 | } |
| 976 | |
/// Populates `patterns` with all ArmSME-to-LLVM lowering patterns and adds a
/// pass-through type conversion for SME tile vector types to `converter`.
void mlir::populateArmSMEToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                  RewritePatternSet &patterns) {
  converter.addConversion(callback: [&](VectorType type) -> std::optional<Type> {
    // There's no LLVM type for SME tiles, but after lowering to intrinsics all
    // SME vector types should be eliminated.
    if (arm_sme::isValidSMETileVectorType(vType: type))
      return type;
    return std::nullopt;
  });

  // Register one conversion pattern per ArmSME op; each widening outer
  // product op maps to its corresponding wide/za32 MOPA/MOPS intrinsic.
  // NOTE(review): addArmSMEConversionPatterns presumably also wires up the
  // tile spill/fill machinery implied by RequiresSpillsAndFills — confirm in
  // its declaration.
  addArmSMEConversionPatterns<
      LoadTileSliceConversion, ExtractTileSliceConversion,
      InsertTileSliceConversion, StoreTileSliceConversion,
      StreamingVLOpConversion, OuterProductOpConversion,
      OuterProductWideningOpConversion<arm_sme::FMopa2WayOp,
                                       arm_sme::aarch64_sme_mopa_wide>,
      OuterProductWideningOpConversion<arm_sme::FMops2WayOp,
                                       arm_sme::aarch64_sme_mops_wide>,
      OuterProductWideningOpConversion<arm_sme::SMopa2WayOp,
                                       arm_sme::aarch64_sme_smopa_za32>,
      OuterProductWideningOpConversion<arm_sme::SMops2WayOp,
                                       arm_sme::aarch64_sme_smops_za32>,
      OuterProductWideningOpConversion<arm_sme::UMopa2WayOp,
                                       arm_sme::aarch64_sme_umopa_za32>,
      OuterProductWideningOpConversion<arm_sme::UMops2WayOp,
                                       arm_sme::aarch64_sme_umops_za32>,
      OuterProductWideningOpConversion<arm_sme::SMopa4WayOp,
                                       arm_sme::aarch64_sme_smopa_wide>,
      OuterProductWideningOpConversion<arm_sme::SMops4WayOp,
                                       arm_sme::aarch64_sme_smops_wide>,
      OuterProductWideningOpConversion<arm_sme::UMopa4WayOp,
                                       arm_sme::aarch64_sme_umopa_wide>,
      OuterProductWideningOpConversion<arm_sme::UMops4WayOp,
                                       arm_sme::aarch64_sme_umops_wide>,
      OuterProductWideningOpConversion<arm_sme::SuMopa4WayOp,
                                       arm_sme::aarch64_sme_sumopa_wide>,
      OuterProductWideningOpConversion<arm_sme::SuMops4WayOp,
                                       arm_sme::aarch64_sme_sumops_wide>,
      OuterProductWideningOpConversion<arm_sme::UsMopa4WayOp,
                                       arm_sme::aarch64_sme_usmopa_wide>,
      OuterProductWideningOpConversion<arm_sme::UsMops4WayOp,
                                       arm_sme::aarch64_sme_usmops_wide>,
      ZeroOpConversion>(patterns, typeConverter: converter);
}
| 1021 | |
| 1022 | std::unique_ptr<Pass> |
| 1023 | mlir::createConvertArmSMEToLLVMPass(bool dumpTileLiveRanges) { |
| 1024 | return std::make_unique<ConvertArmSMEToLLVMPass>(args&: dumpTileLiveRanges); |
| 1025 | } |
| 1026 | |