//===- LowerWorkshare.cpp - Lower omp.workshare to other OpenMP ops ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the lowering of omp.workshare to other omp constructs.
//
// This pass is tasked with parallelizing the loops nested in
// workshare.loop_wrapper, while the Fortran to MLIR lowering and the HLFIR to
// FIR lowering pipelines are responsible for emitting the
// workshare.loop_wrapper ops where appropriate, according to the
// `shouldUseWorkshareLowering` function.
//
//===----------------------------------------------------------------------===//

#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
#include <flang/Optimizer/HLFIR/HLFIROps.h>
#include <flang/Optimizer/OpenMP/Passes.h>
#include <llvm/ADT/BreadthFirstIterator.h>
#include <llvm/ADT/STLExtras.h>
#include <llvm/ADT/SmallVectorExtras.h>
#include <llvm/ADT/iterator_range.h>
#include <llvm/Support/ErrorHandling.h>
#include <mlir/Dialect/Arith/IR/Arith.h>
#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
#include <mlir/Dialect/OpenMP/OpenMPClauseOperands.h>
#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/IRMapping.h>
#include <mlir/IR/OpDefinition.h>
#include <mlir/IR/PatternMatch.h>
#include <mlir/IR/Value.h>
#include <mlir/IR/Visitors.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>

#include <variant>

namespace flangomp {
#define GEN_PASS_DEF_LOWERWORKSHARE
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

#define DEBUG_TYPE "lower-workshare"

using namespace mlir;

namespace flangomp {

// Check for the nesting pattern below; we need to avoid sharing the work of
// statements which are nested in some constructs such as omp.critical or
// another omp.parallel.
//
// omp.workshare { // `wsOp`
//   ...
//     omp.T { // `parent`
//       ...
//         `op`
//
template <typename T>
static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) {
  T parent = op->getParentOfType<T>();
  if (!parent)
    return false;
  return wsOp->isProperAncestor(parent);
}

bool shouldUseWorkshareLowering(Operation *op) {
  auto parentWorkshare = op->getParentOfType<omp::WorkshareOp>();

  if (!parentWorkshare)
    return false;

  if (isNestedIn<omp::CriticalOp>(parentWorkshare, op))
    return false;

  // 2.8.3 workshare Construct
  // For a parallel construct, the construct is a unit of work with respect to
  // the workshare construct. The statements contained in the parallel
  // construct are executed by a new thread team.
  if (isNestedIn<omp::ParallelOp>(parentWorkshare, op))
    return false;

  // 2.8.2 single Construct
  // Binding: The binding thread set for a single region is the current team.
  // A single region binds to the innermost enclosing parallel region.
  // Description: Only one of the encountering threads will execute the
  // structured block associated with the single construct.
  if (isNestedIn<omp::SingleOp>(parentWorkshare, op))
    return false;

  // Do not use workshare lowering until we support CFG in omp.workshare.
  if (parentWorkshare.getRegion().getBlocks().size() != 1)
    return false;

  return true;
}

} // namespace flangomp

namespace {

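/// A contiguous range of operations [begin, end) within one block that
/// contains no operation which must be parallelized. Such a range is emitted
/// either inside an omp.single or, when every operation in it is safe to
/// parallelize, replicated directly on all threads.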
struct SingleRegion {
  Block::iterator begin, end;
};

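/// Returns true if `op` contains an omp.workshare.loop_wrapper that binds to
/// the workshare region currently being lowered, i.e. if the work it performs
/// must be distributed among the threads of the team.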
static bool mustParallelizeOp(Operation *op) {
  return op
      ->walk([&](Operation *nested) {
        // We need to be careful not to pick up workshare.loop_wrapper in
        // nested omp.parallel{omp.workshare} regions, i.e. make sure that
        // `nested` binds to the workshare region we are currently handling.
        //
        // For example:
        //
        // omp.parallel {
        //   omp.workshare { // currently handling this
        //     omp.parallel {
        //       omp.workshare { // nested workshare
        //         omp.workshare.loop_wrapper {}
        //
        // Therefore, we skip if we encounter a nested omp.workshare.
        if (isa<omp::WorkshareOp>(nested))
          return WalkResult::skip();
        if (isa<omp::WorkshareLoopWrapperOp>(nested))
          return WalkResult::interrupt();
        return WalkResult::advance();
      })
      .wasInterrupted();
}

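/// An operation is safe to parallelize (i.e. execute redundantly on every
/// thread of the team) if it has no memory effects, or if it is a declare
/// operation, which only attaches variable information to an existing memory
/// reference.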
static bool isSafeToParallelize(Operation *op) {
  return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
         isMemoryEffectFree(op);
}

/// Simple shallow copies suffice for our purposes in this pass, so we
/// implement this simpler alternative to the full-fledged `createCopyFunc` in
/// the frontend.
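///
/// For a scalar i32 copyprivate variable, the generated function is roughly of
/// the following form (illustrative sketch; the exact symbol name is derived
/// from the element type via `fir::getTypeAsString`):
///
///   func.func private @_workshare_copy_i32(%dst : !fir.ref<i32>,
///                                          %src : !fir.ref<i32>) {
///     %0 = fir.load %src : !fir.ref<i32>
///     fir.store %0 to %dst : !fir.ref<i32>
///     return
///   }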
static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType,
                                         fir::FirOpBuilder builder) {
  mlir::ModuleOp module = builder.getModule();
  auto rt = cast<fir::ReferenceType>(varType);
  mlir::Type eleTy = rt.getEleTy();
  std::string copyFuncName =
      fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy");

  if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
    return decl;

  // Create the function.
  mlir::OpBuilder::InsertionGuard guard(builder);
  mlir::OpBuilder modBuilder(module.getBodyRegion());
  llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
  auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
  mlir::func::FuncOp funcOp =
      modBuilder.create<mlir::func::FuncOp>(loc, copyFuncName, funcType);
  funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
  fir::factory::setInternalLinkage(funcOp);
  builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
                      {loc, loc});
  builder.setInsertionPointToStart(&funcOp.getRegion().back());

  // Shallow copy: load from the source argument and store into the
  // destination argument.
  Value loaded = builder.create<fir::LoadOp>(loc, funcOp.getArgument(1));
  builder.create<fir::StoreOp>(loc, loaded, funcOp.getArgument(0));

  builder.create<mlir::func::ReturnOp>(loc);
  return funcOp;
}

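/// Returns true if `user` lies outside the single region `sr`. The user is
/// first walked up to its ancestor that is immediately nested in `parentOp`
/// (the operation containing `sr`'s block), so that uses inside nested regions
/// are attributed to the enclosing operation in that block.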
static bool isUserOutsideSR(Operation *user, Operation *parentOp,
                            SingleRegion sr) {
  while (user->getParentOp() != parentOp)
    user = user->getParentOp();
  return sr.begin->getBlock() != user->getBlock() ||
         !(user->isBeforeInBlock(&*sr.end) && sr.begin->isBeforeInBlock(user));
}

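/// Returns true if the value `v`, or any value computed from it through a
/// chain of safe-to-parallelize operations within `sr`, has a use outside the
/// single region `sr`.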
static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) {
  Block *srBlock = sr.begin->getBlock();
  Operation *parentOp = srBlock->getParentOp();

  for (auto &use : v.getUses()) {
    Operation *user = use.getOwner();
    if (isUserOutsideSR(user, parentOp, sr))
      return true;

    // Now we know user is inside `sr`.

    // Results of nested users cannot be used outside of `sr`.
    if (user->getBlock() != srBlock)
      continue;

    // A non-safe-to-parallelize operation will be checked for uses outside
    // separately.
    if (!isSafeToParallelize(user))
      continue;

    // For safe-to-parallelize operations, we need to check if there is a
    // transitive use of `v` through them.
    for (auto res : user->getResults())
      if (isTransitivelyUsedOutside(res, sr))
        return true;
  }
  return false;
}

/// We clone pure operations in both the parallel and single blocks. This
/// function cleans them up if they end up with no uses.
static void cleanupBlock(Block *block) {
  for (Operation &op : llvm::make_early_inc_range(
           llvm::make_range(block->rbegin(), block->rend())))
    if (isOpTriviallyDead(&op))
      op.erase();
}

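/// Clones `sourceRegion` into `targetRegion`, rewriting its contents so that
/// they can be executed by a whole thread team: operations that must be
/// parallelized (e.g. omp.workshare.loop_wrapper) become omp.wsloop nests,
/// while runs of other operations are wrapped in omp.single regions whose
/// results are made visible to all threads either by replication (when safe)
/// or through copyprivate temporaries.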
static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
                              IRMapping &rootMapping, Location loc,
                              mlir::DominanceInfo &di) {
  OpBuilder rootBuilder(sourceRegion.getContext());
  ModuleOp m = sourceRegion.getParentOfType<ModuleOp>();
  OpBuilder copyFuncBuilder(m.getBodyRegion());
  fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m);

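  // Allocate a temporary for `v`, store the value computed inside the
  // omp.single region into it, and reload it on all threads after the single.
  // Returns the temporary so it can be added to the copyprivate list, or
  // nullptr if `v` has already been remapped.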
  auto mapReloadedValue =
      [&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder,
          OpBuilder parallelBuilder, IRMapping singleMapping) -> Value {
    if (auto reloaded = rootMapping.lookupOrNull(v))
      return nullptr;
    Type ty = v.getType();
    Value alloc = allocaBuilder.create<fir::AllocaOp>(loc, ty);
    singleBuilder.create<fir::StoreOp>(loc, singleMapping.lookup(v), alloc);
    Value reloaded = parallelBuilder.create<fir::LoadOp>(loc, ty, alloc);
    rootMapping.map(v, reloaded);
    return alloc;
  };

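  // Emit the single region `sr`: clone its operations into the omp.single
  // block (`singleBuilder`), replicate the safe-to-parallelize ones into the
  // parallel block (`parallelBuilder`), and hoist allocas into `allocaBuilder`.
  // Returns whether every operation could be replicated on all threads (in
  // which case the omp.single is unnecessary) together with the list of
  // copyprivate temporaries.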
  auto moveToSingle =
      [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
          OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
    IRMapping singleMapping = rootMapping;
    SmallVector<Value> copyPrivate;
    bool allParallelized = true;

    for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
      if (isSafeToParallelize(&op)) {
        singleBuilder.clone(op, singleMapping);
        if (llvm::all_of(op.getOperands(), [&](Value opr) {
              // Either we have already remapped it
              bool remapped = rootMapping.contains(opr);
              // Or it is available because it dominates `sr`
              bool dominates = di.properlyDominates(opr, &*sr.begin);
              return remapped || dominates;
            })) {
          // Safe-to-parallelize operations which have all operands available
          // in the root parallel block can be executed there.
          parallelBuilder.clone(op, rootMapping);
        } else {
          // If any operand was not available, it means that there was no
          // transitive use of a non-safe-to-parallelize operation outside
          // `sr`. This means that there should be no transitive uses outside
          // `sr` of `op`.
          assert(llvm::all_of(op.getResults(), [&](Value v) {
            return !isTransitivelyUsedOutside(v, sr);
          }));
          allParallelized = false;
        }
      } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
        auto hoisted =
            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
        rootMapping.map(&*alloca, &*hoisted);
        rootMapping.map(alloca.getResult(), hoisted.getResult());
        copyPrivate.push_back(hoisted);
        allParallelized = false;
      } else {
        singleBuilder.clone(op, singleMapping);
        // Prepare reloaded values for results of operations that cannot be
        // safely parallelized and which are used after the region `sr`.
        for (auto res : op.getResults()) {
          if (isTransitivelyUsedOutside(res, sr)) {
            auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder,
                                          parallelBuilder, singleMapping);
            if (alloc)
              copyPrivate.push_back(alloc);
          }
        }
        allParallelized = false;
      }
    }
    singleBuilder.create<omp::TerminatorOp>(loc);
    return {allParallelized, copyPrivate};
  };

  for (Block &block : sourceRegion) {
    Block *targetBlock = rootBuilder.createBlock(
        &targetRegion, {}, block.getArgumentTypes(),
        llvm::map_to_vector(block.getArguments(),
                            [](BlockArgument arg) { return arg.getLoc(); }));
    rootMapping.map(&block, targetBlock);
    rootMapping.map(block.getArguments(), targetBlock->getArguments());
  }

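  // Lower one source block: partition it into an alternating sequence of
  // single regions and operations that must be parallelized, then emit an
  // omp.single (plus replicated/reloaded values) for each single region and an
  // omp.wsloop (or a recursively parallelized clone) for each parallelized op.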
  auto handleOneBlock = [&](Block &block) {
    Block &targetBlock = *rootMapping.lookup(&block);
    rootBuilder.setInsertionPointToStart(&targetBlock);
    Operation *terminator = block.getTerminator();
    SmallVector<std::variant<SingleRegion, Operation *>> regions;

    auto it = block.begin();
    auto getOneRegion = [&]() {
      if (&*it == terminator)
        return false;
      if (mustParallelizeOp(&*it)) {
        regions.push_back(&*it);
        it++;
        return true;
      }
      SingleRegion sr;
      sr.begin = it;
      while (&*it != terminator && !mustParallelizeOp(&*it))
        it++;
      sr.end = it;
      assert(sr.begin != sr.end);
      regions.push_back(sr);
      return true;
    };
    while (getOneRegion())
      ;

    for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
      bool isLast = i + 1 == regions.size();
      if (std::holds_alternative<SingleRegion>(opOrSingle)) {
        OpBuilder singleBuilder(sourceRegion.getContext());
        Block *singleBlock = new Block();
        singleBuilder.setInsertionPointToStart(singleBlock);

        OpBuilder allocaBuilder(sourceRegion.getContext());
        Block *allocaBlock = new Block();
        allocaBuilder.setInsertionPointToStart(allocaBlock);

        OpBuilder parallelBuilder(sourceRegion.getContext());
        Block *parallelBlock = new Block();
        parallelBuilder.setInsertionPointToStart(parallelBlock);

        auto [allParallelized, copyprivateVars] =
            moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
                         singleBuilder, parallelBuilder);
        if (allParallelized) {
          // The single region was not required as all operations were safe to
          // parallelize.
          assert(copyprivateVars.empty());
          assert(allocaBlock->empty());
          delete singleBlock;
        } else {
          omp::SingleOperands singleOperands;
          if (isLast)
            singleOperands.nowait = rootBuilder.getUnitAttr();
          singleOperands.copyprivateVars = copyprivateVars;
          cleanupBlock(singleBlock);
          for (auto var : singleOperands.copyprivateVars) {
            mlir::func::FuncOp funcOp =
                createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
            singleOperands.copyprivateSyms.push_back(
                SymbolRefAttr::get(funcOp));
          }
          omp::SingleOp singleOp =
              rootBuilder.create<omp::SingleOp>(loc, singleOperands);
          singleOp.getRegion().push_back(singleBlock);
          targetRegion.front().getOperations().splice(
              singleOp->getIterator(), allocaBlock->getOperations());
        }
        rootBuilder.getInsertionBlock()->getOperations().splice(
            rootBuilder.getInsertionPoint(), parallelBlock->getOperations());
        delete allocaBlock;
        delete parallelBlock;
      } else {
        auto op = std::get<Operation *>(opOrSingle);
        if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) {
          omp::WsloopOperands wsloopOperands;
          if (isLast)
            wsloopOperands.nowait = rootBuilder.getUnitAttr();
          auto wsloop =
              rootBuilder.create<mlir::omp::WsloopOp>(loc, wsloopOperands);
          auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>(
              rootBuilder.clone(*wslw, rootMapping));
          wsloop.getRegion().takeBody(clonedWslw.getRegion());
          clonedWslw->erase();
        } else {
          assert(mustParallelizeOp(op));
          Operation *cloned = rootBuilder.cloneWithoutRegions(*op, rootMapping);
          for (auto [region, clonedRegion] :
               llvm::zip(op->getRegions(), cloned->getRegions()))
            parallelizeRegion(region, clonedRegion, rootMapping, loc, di);
        }
      }
    }

    rootBuilder.clone(*block.getTerminator(), rootMapping);
  };

  if (sourceRegion.hasOneBlock()) {
    handleOneBlock(sourceRegion.front());
  } else if (!sourceRegion.empty()) {
    auto &domTree = di.getDomTree(&sourceRegion);
    for (auto node : llvm::breadth_first(domTree.getRootNode())) {
      handleOneBlock(*node->getBlock());
    }
  }

  for (Block &targetBlock : targetRegion)
    cleanupBlock(&targetBlock);
}

/// Lowers workshare to a sequence of single-thread regions and parallel loops.
///
/// For example:
///
/// omp.workshare {
///   %a = fir.allocmem
///   omp.workshare.loop_wrapper {}
///   fir.call Assign %b %a
///   fir.freemem %a
/// }
///
/// becomes
///
/// %tmp = fir.alloca
/// omp.single copyprivate(%tmp) {
///   %a = fir.allocmem
///   fir.store %a %tmp
/// }
/// %a_reloaded = fir.load %tmp
/// omp.wsloop {}
/// omp.single {
///   fir.call Assign %b %a_reloaded
///   fir.freemem %a_reloaded
/// }
///
/// Note that we allocate temporary memory for values produced in omp.single
/// regions which need to be accessed by all threads, and broadcast them using
/// the single's copyprivate clause.
LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
  Location loc = wsOp->getLoc();
  IRMapping rootMapping;

  OpBuilder rootBuilder(wsOp);

  // FIXME Currently, we only support workshare constructs with structured
  // control flow. The transformation itself supports CFG; however, once we
  // transform the MLIR region in the omp.workshare, we need to inline that
  // region into the parent block. We have no guarantees at this point of the
  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
  // generally possible. The alternative is to put the lowered region in an
  // operation akin to scf.execute_region, which will get lowered at the same
  // time when fir ops get lowered to CFG. However, SCF is not registered in
  // flang so we cannot use it. Remove this requirement once we have
  // scf.execute_region or an alternative operation available.
  if (wsOp.getRegion().getBlocks().size() == 1) {
    // This operation is just a placeholder which will be erased later. We need
    // it because our `parallelizeRegion` function works on regions and not
    // blocks.
    omp::WorkshareOp newOp =
        rootBuilder.create<omp::WorkshareOp>(loc, omp::WorkshareOperands());
    if (!wsOp.getNowait())
      rootBuilder.create<omp::BarrierOp>(loc);

    parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc,
                      di);

    // Inline the contents of the placeholder workshare op into its parent
    // block.
    Block *theBlock = &newOp.getRegion().front();
    Operation *term = theBlock->getTerminator();
    Block *parentBlock = wsOp->getBlock();
    parentBlock->getOperations().splice(newOp->getIterator(),
                                        theBlock->getOperations());
    assert(term->getNumOperands() == 0);
    term->erase();
    newOp->erase();
    wsOp->erase();
  } else {
    // Otherwise just change the operation to an omp.single.

    wsOp->emitWarning(
        "omp workshare with unstructured control flow is currently "
        "unsupported and will be serialized.");

    // `shouldUseWorkshareLowering` should have guaranteed that there are no
    // omp.workshare.loop_wrapper ops that bind to this omp.workshare.
    assert(!wsOp->walk([&](Operation *op) {
               // Nested omp.workshare ops can have their own
               // omp.workshare.loop_wrapper ops.
               if (isa<omp::WorkshareOp>(op))
                 return WalkResult::skip();
               if (isa<omp::WorkshareLoopWrapperOp>(op))
                 return WalkResult::interrupt();
               return WalkResult::advance();
             })
                .wasInterrupted());

    omp::SingleOperands operands;
    operands.nowait = wsOp.getNowaitAttr();
    omp::SingleOp newOp = rootBuilder.create<omp::SingleOp>(loc, operands);

    newOp.getRegion().getBlocks().splice(newOp.getRegion().getBlocks().begin(),
                                         wsOp.getRegion().getBlocks());
    wsOp->erase();
  }
  return success();
}

class LowerWorksharePass
    : public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> {
public:
  void runOnOperation() override {
    mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>();
    getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) {
      if (failed(lowerWorkshare(wsOp, di)))
        signalPassFailure();
    });
  }
};
} // namespace
