| 1 | //===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file contains the kernel entry points for the device. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "Shared/Environment.h" |
| 14 | |
| 15 | #include "Allocator.h" |
| 16 | #include "Debug.h" |
| 17 | #include "DeviceTypes.h" |
| 18 | #include "Interface.h" |
| 19 | #include "Mapping.h" |
| 20 | #include "State.h" |
| 21 | #include "Synchronization.h" |
| 22 | #include "Workshare.h" |
| 23 | |
| 24 | using namespace ompx; |
| 25 | |
/// Bit flags describing the execution mode of a target kernel.
///
/// NOTE: This is a local copy of the flags defined in
/// "llvm/Frontend/OpenMP/OMPDeviceConstants.h"; both definitions must be kept
/// in-sync.
enum OMPTgtExecModeFlags : unsigned char {
  OMP_TGT_EXEC_MODE_BARE = 0x0,
  OMP_TGT_EXEC_MODE_GENERIC = 0x1, // Bit 0.
  OMP_TGT_EXEC_MODE_SPMD = 0x2,    // Bit 1.
  // Both the generic and the SPMD bit are set.
  OMP_TGT_EXEC_MODE_GENERIC_SPMD =
      OMP_TGT_EXEC_MODE_SPMD | OMP_TGT_EXEC_MODE_GENERIC
};
| 35 | |
| 36 | static void |
| 37 | inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, |
| 38 | KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { |
| 39 | // Order is important here. |
| 40 | synchronize::init(IsSPMD); |
| 41 | mapping::init(IsSPMD); |
| 42 | state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); |
| 43 | allocator::init(IsSPMD, KernelEnvironment); |
| 44 | workshare::init(IsSPMD); |
| 45 | } |
| 46 | |
| 47 | /// Simple generic state machine for worker threads. |
| 48 | static void genericStateMachine(IdentTy *Ident) { |
| 49 | uint32_t TId = mapping::getThreadIdInBlock(); |
| 50 | |
| 51 | do { |
| 52 | ParallelRegionFnTy WorkFn = nullptr; |
| 53 | |
| 54 | // Wait for the signal that we have a new work function. |
| 55 | synchronize::threads(atomic::seq_cst); |
| 56 | |
| 57 | // Retrieve the work function from the runtime. |
| 58 | bool IsActive = __kmpc_kernel_parallel(&WorkFn); |
| 59 | |
| 60 | // If there is nothing more to do, break out of the state machine by |
| 61 | // returning to the caller. |
| 62 | if (!WorkFn) |
| 63 | return; |
| 64 | |
| 65 | if (IsActive) { |
| 66 | ASSERT(!mapping::isSPMDMode(), nullptr); |
| 67 | ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); |
| 68 | __kmpc_kernel_end_parallel(); |
| 69 | } |
| 70 | |
| 71 | synchronize::threads(atomic::seq_cst); |
| 72 | |
| 73 | } while (true); |
| 74 | } |
| 75 | |
| 76 | extern "C" { |
| 77 | |
| 78 | /// Initialization |
| 79 | /// |
| 80 | /// \param Ident Source location identification, can be NULL. |
| 81 | /// |
| 82 | int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, |
| 83 | KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { |
| 84 | ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; |
| 85 | bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; |
| 86 | bool UseGenericStateMachine = Configuration.UseGenericStateMachine; |
| 87 | if (IsSPMD) { |
| 88 | inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, |
| 89 | KernelLaunchEnvironment); |
| 90 | synchronize::threadsAligned(atomic::relaxed); |
| 91 | } else { |
| 92 | inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, |
| 93 | KernelLaunchEnvironment); |
| 94 | // No need to wait since only the main threads will execute user |
| 95 | // code and workers will run into a barrier right away. |
| 96 | } |
| 97 | |
| 98 | if (IsSPMD) { |
| 99 | state::assumeInitialState(IsSPMD); |
| 100 | |
| 101 | // Synchronize to ensure the assertions above are in an aligned region. |
| 102 | // The barrier is eliminated later. |
| 103 | synchronize::threadsAligned(atomic::relaxed); |
| 104 | return -1; |
| 105 | } |
| 106 | |
| 107 | if (mapping::isInitialThreadInLevel0(IsSPMD)) |
| 108 | return -1; |
| 109 | |
| 110 | // Enter the generic state machine if enabled and if this thread can possibly |
| 111 | // be an active worker thread. |
| 112 | // |
| 113 | // The latter check is important for NVIDIA Pascal (but not Volta) and AMD |
| 114 | // GPU. In those cases, a single thread can apparently satisfy a barrier on |
| 115 | // behalf of all threads in the same warp. Thus, it would not be safe for |
| 116 | // other threads in the main thread's warp to reach the first |
| 117 | // synchronize::threads call in genericStateMachine before the main thread |
| 118 | // reaches its corresponding synchronize::threads call: that would permit all |
| 119 | // active worker threads to proceed before the main thread has actually set |
| 120 | // state::ParallelRegionFn, and then they would immediately quit without |
| 121 | // doing any work. mapping::getMaxTeamThreads() does not include any of the |
| 122 | // main thread's warp, so none of its threads can ever be active worker |
| 123 | // threads. |
| 124 | if (UseGenericStateMachine && |
| 125 | mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) |
| 126 | genericStateMachine(KernelEnvironment.Ident); |
| 127 | |
| 128 | return mapping::getThreadIdInBlock(); |
| 129 | } |
| 130 | |
| 131 | /// De-Initialization |
| 132 | /// |
| 133 | /// In non-SPMD, this function releases the workers trapped in a state machine |
| 134 | /// and also any memory dynamically allocated by the runtime. |
| 135 | /// |
| 136 | /// \param Ident Source location identification, can be NULL. |
| 137 | /// |
| 138 | void __kmpc_target_deinit() { |
| 139 | bool IsSPMD = mapping::isSPMDMode(); |
| 140 | if (IsSPMD) |
| 141 | return; |
| 142 | |
| 143 | if (mapping::isInitialThreadInLevel0(IsSPMD)) { |
| 144 | // Signal the workers to exit the state machine and exit the kernel. |
| 145 | state::ParallelRegionFn = nullptr; |
| 146 | } else if (!state::getKernelEnvironment() |
| 147 | .Configuration.UseGenericStateMachine) { |
| 148 | // Retrieve the work function just to ensure we always call |
| 149 | // __kmpc_kernel_parallel even if a custom state machine is used. |
| 150 | // TODO: this is not super pretty. The problem is we create the call to |
| 151 | // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it |
| 152 | // is not there yet. Thus, we assume we never reach it from |
| 153 | // __kmpc_target_deinit. That allows us to remove the store in there to |
| 154 | // ParallelRegionFn, which leads to bad results later on. |
| 155 | ParallelRegionFnTy WorkFn = nullptr; |
| 156 | __kmpc_kernel_parallel(&WorkFn); |
| 157 | ASSERT(WorkFn == nullptr, nullptr); |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } |
| 162 | } |
| 163 | |