// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

/* collect cache statistics in debug builds only */
#if defined(DEBUG)
#define CACHE_STATS(x) x
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t        cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////

 struct __aligned(64) ThreadWorkState
 {
   ALIGNED_STRUCT_(64);

   std::atomic<size_t> counter;
   ThreadWorkState* next;
   bool allocated;

   __forceinline ThreadWorkState(bool allocated = false)
     : counter(0), next(nullptr), allocated(allocated)
   {
     assert( ((size_t)this % 64) == 0 );
   }
 };
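
 /* The per-thread counter above serves as a lock count: lockThread() increments it
    by one for each cache access, and a value of THREAD_BLOCK_ATOMIC_ADD or larger
    signals that a synchronization (cache flush) phase is in progress, causing
    lockThreadLoop() below to back off until the counter drops back to zero. */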

 class __aligned(64) SharedLazyTessellationCache
 {
 public:

   static const size_t NUM_CACHE_SEGMENTS              = 8;
   static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
   static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__64BIT__)
   static const size_t REF_TAG_MASK                    = 0xffffffffff;
#else
   static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
#endif
   static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
   static const size_t BLOCK_SIZE                      = 64;


    /*! Per thread tessellation ref cache */
   static __thread ThreadWorkState* init_t_state;
   static ThreadWorkState* current_t_state;

   static __forceinline ThreadWorkState *threadState()
   {
     if (unlikely(!init_t_state))
       /* sets init_t_state, can't return pointer due to macosx icc bug */
       SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
     return init_t_state;
   }

   struct Tag
   {
     __forceinline Tag() : data(0) {}

     __forceinline Tag(void* ptr, size_t combinedTime) {
       init(ptr,combinedTime);
     }

     __forceinline Tag(size_t ptr, size_t combinedTime) {
       init((void*)ptr,combinedTime);
     }

     __forceinline void init(void* ptr, size_t combinedTime)
     {
       if (ptr == nullptr) {
         data = 0;
         return;
       }
       int64_t new_root_ref = (int64_t) ptr;
       new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
       assert( new_root_ref <= (int64_t)REF_TAG_MASK );
       new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
       data = new_root_ref;
     }

     __forceinline int64_t get() const { return data.load(); }
     __forceinline void set( int64_t v ) { data.store(v); }
     __forceinline void reset() { data.store(0); }

   private:
     atomic<int64_t> data;
   };
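
   /* Tag encoding, illustrated: the low 40 bits (REF_TAG_MASK on 64-bit builds,
      COMMIT_INDEX_SHIFT = 40) store the byte offset of the cached data relative
      to the cache base pointer, and the upper bits store the combined time stamp.
      For example, a block at offset 0x1000 committed at combined time 5 yields
      data = ((int64_t)5 << 40) | 0x1000, and extractCommitIndex() below recovers
      the time stamp by shifting right by COMMIT_INDEX_SHIFT. */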

   static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

   struct CacheEntry
   {
     Tag tag;
     SpinLock mutex;
   };

 private:

   float *data;
   bool hugepages;
   size_t size;
   size_t maxBlocks;
   ThreadWorkState *threadWorkState;

   __aligned(64) std::atomic<size_t> localTime;
   __aligned(64) std::atomic<size_t> next_block;
   __aligned(64) SpinLock   reset_state;
   __aligned(64) SpinLock   linkedlist_mtx;
   __aligned(64) std::atomic<size_t> switch_block_threshold;
   __aligned(64) std::atomic<size_t> numRenderThreads;


 public:

   SharedLazyTessellationCache();
   ~SharedLazyTessellationCache();

   void getNextRenderThreadWorkState();

   __forceinline size_t maxAllocSize() const {
     return switch_block_threshold;
   }

   __forceinline size_t getCurrentIndex() { return localTime.load(); }
   __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

   __forceinline size_t getTime(const size_t globalTime) {
     return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
   }


   __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
   __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

   __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

   static __forceinline void lock  ()  { sharedLazyTessellationCache.lockThread(threadState()); }
   static __forceinline void unlock()  { sharedLazyTessellationCache.unlockThread(threadState()); }
   static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
   static __forceinline size_t getState() { return threadState()->counter.load(); }
   static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

   static __forceinline size_t getTCacheTime(const size_t globalTime) {
     return sharedLazyTessellationCache.getTime(globalTime);
   }

   /* per thread lock */
   __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
   {
     while(1)
     {
       size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
       if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
       {
         /* lock failed, wait until the sync phase is over */
         sharedLazyTessellationCache.unlockThread(t_state,-1);
         sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
       }
       else
         break;
     }
   }

   static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
   {
     const int64_t subdiv_patch_root_ref = entry.tag.get();
     CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

     if (likely(subdiv_patch_root_ref != 0))
     {
       const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
       const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

       if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
       {
         CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
         return (void*) subdiv_patch_root;
       }
     }
     CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
     return nullptr;
   }

   template<typename Constructor>
     static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
   {
     ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

     while (true)
     {
       sharedLazyTessellationCache.lockThreadLoop(t_state);
       void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
       if (patch) return (decltype(constructor())) patch;

       if (entry.mutex.try_lock())
       {
         if (!validTag(entry.tag,globalTime))
         {
           auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
           auto ret = constructor(); // thread is locked here!
           /* the constructor should never return nullptr */
           assert(ret);
           auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
           auto time = before ? timeBefore : timeAfter;
           __memory_barrier();
           entry.tag = SharedLazyTessellationCache::Tag(ret,time);
           __memory_barrier();
           entry.mutex.unlock();
           return ret;
         }
         entry.mutex.unlock();
       }
       SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
     }
   }
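
   /* Illustrative use of the templated lookup() above (hypothetical caller code,
      shown only as a sketch): the constructor is invoked while the calling thread
      holds its cache lock, and the thread remains locked when lookup() returns,
      so the caller releases the lock once it is done with the returned data:

        SharedLazyTessellationCache::CacheEntry entry;  // e.g. stored per patch
        float* grid = SharedLazyTessellationCache::lookup(entry,globalTime,[&]() {
          float* ptr = (float*) SharedLazyTessellationCache::malloc(numBytes);
          // ... fill ptr with tessellated data ...
          return ptr;
        });
        // ... use grid ...
        SharedLazyTessellationCache::unlock();
   */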

   __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
   {
#if FORCE_SIMPLE_FLUSH == 1
     return i == getTime(globalTime);
#else
     return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
   }

   static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
   {
     return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
   }
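
   /* Worked example of the aging scheme: with NUM_CACHE_SEGMENTS == 8, an entry
      stamped with time t stays valid as long as t+7 >= current time, i.e. it
      survives up to seven segment switches before it has to be rebuilt (with
      FORCE_SIMPLE_FLUSH enabled only an exact time match is accepted). */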


   static __forceinline bool validTag(const Tag& tag, size_t globalTime)
   {
     const int64_t subdiv_patch_root_ref = tag.get();
     if (subdiv_patch_root_ref == 0) return false;
     const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
     return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
   }

   void waitForUsersLessEqual(ThreadWorkState *const t_state,
                              const unsigned int users);

   __forceinline size_t alloc(const size_t blocks)
   {
     if (unlikely(blocks >= switch_block_threshold))
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

     assert(blocks < switch_block_threshold);
     size_t index = next_block.fetch_add(blocks);
     if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
     return index;
   }

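   /* malloc() below rounds the request up to whole cache blocks, e.g. a 100 byte
      request with BLOCK_SIZE == 64 becomes (100+63)/64 = 2 blocks; if the current
      segment is exhausted the thread temporarily drops its cache lock, switches to
      the next segment via allocNextSegment(), and retries. */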
   static __forceinline void* malloc(const size_t bytes)
   {
     size_t block_index = -1;
     ThreadWorkState *const t_state = threadState();
     while (true)
     {
       block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
       if (block_index == (size_t)-1)
       {
         sharedLazyTessellationCache.unlockThread(t_state);
         sharedLazyTessellationCache.allocNextSegment();
         sharedLazyTessellationCache.lockThread(t_state);
         continue;
       }
       break;
     }
     return sharedLazyTessellationCache.getBlockPtr(block_index);
   }

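   /* getBlockPtr() translates a block index into an address inside the cache:
      data is a float array, so block_index*16 floats correspond to
      block_index*64 bytes, matching BLOCK_SIZE == 64. */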
   __forceinline void *getBlockPtr(const size_t block_index)
   {
     assert(block_index < maxBlocks);
     assert(data);
     assert(block_index*16 <= size);
     return (void*)&data[block_index*16];
   }

   __forceinline void*  getDataPtr()      { return data; }
   __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
   __forceinline size_t getMaxBlocks()    { return maxBlocks; }
   __forceinline size_t getSize()         { return size; }

   void allocNextSegment();
   void realloc(const size_t newSize);

   void reset();

   static SharedLazyTessellationCache sharedLazyTessellationCache;
 };
}