1 | //===- File.h - Reading sparse tensors from files ---------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements reading sparse tensor from files in one of the |
10 | // following external formats: |
11 | // |
12 | // (1) Matrix Market Exchange (MME): *.mtx |
13 | // https://math.nist.gov/MatrixMarket/formats.html |
14 | // |
15 | // (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns |
16 | // http://frostt.io/tensors/file-formats.html |
17 | // |
18 | //===----------------------------------------------------------------------===// |
19 | |
20 | #ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H |
21 | #define MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H |
22 | |
23 | #include "mlir/ExecutionEngine/SparseTensor/MapRef.h" |
24 | #include "mlir/ExecutionEngine/SparseTensor/Storage.h" |
25 | |
26 | #include <fstream> |
27 | |
28 | namespace mlir { |
29 | namespace sparse_tensor { |
30 | |
31 | namespace detail { |
32 | |
33 | template <typename T> |
34 | struct is_complex final : public std::false_type {}; |
35 | |
36 | template <typename T> |
37 | struct is_complex<std::complex<T>> final : public std::true_type {}; |
38 | |
39 | /// Returns an element-value of non-complex type. If `IsPattern` is true, |
40 | /// then returns an arbitrary value. If `IsPattern` is false, then |
41 | /// reads the value from the current line buffer beginning at `linePtr`. |
42 | template <typename V, bool IsPattern> |
43 | inline std::enable_if_t<!is_complex<V>::value, V> readValue(char **linePtr) { |
44 | // The external formats always store these numerical values with the type |
45 | // double, but we cast these values to the sparse tensor object type. |
46 | // For a pattern tensor, we arbitrarily pick the value 1 for all entries. |
47 | if constexpr (IsPattern) |
48 | return 1.0; |
49 | return strtod(nptr: *linePtr, endptr: linePtr); |
50 | } |
51 | |
52 | /// Returns an element-value of complex type. If `IsPattern` is true, |
53 | /// then returns an arbitrary value. If `IsPattern` is false, then reads |
54 | /// the value from the current line buffer beginning at `linePtr`. |
55 | template <typename V, bool IsPattern> |
56 | inline std::enable_if_t<is_complex<V>::value, V> readValue(char **linePtr) { |
57 | // Read two values to make a complex. The external formats always store |
58 | // numerical values with the type double, but we cast these values to the |
59 | // sparse tensor object type. For a pattern tensor, we arbitrarily pick the |
60 | // value 1 for all entries. |
61 | if constexpr (IsPattern) |
62 | return V(1.0, 1.0); |
63 | double re = strtod(nptr: *linePtr, endptr: linePtr); |
64 | double im = strtod(nptr: *linePtr, endptr: linePtr); |
65 | // Avoiding brace-notation since that forbids narrowing to `float`. |
66 | return V(re, im); |
67 | } |
68 | |
69 | /// Returns an element-value. If `isPattern` is true, then returns an |
70 | /// arbitrary value. If `isPattern` is false, then reads the value from |
71 | /// the current line buffer beginning at `linePtr`. |
72 | template <typename V> |
73 | inline V readValue(char **linePtr, bool isPattern) { |
74 | return isPattern ? readValue<V, true>(linePtr) : readValue<V, false>(linePtr); |
75 | } |
76 | |
77 | } // namespace detail |
78 | |
79 | //===----------------------------------------------------------------------===// |
80 | // |
81 | // Reader class. |
82 | // |
83 | //===----------------------------------------------------------------------===// |
84 | |
85 | /// This class abstracts over the information stored in file headers, |
86 | /// as well as providing the buffers and methods for parsing those headers. |
87 | class SparseTensorReader final { |
88 | public: |
89 | enum class ValueKind : uint8_t { |
90 | // The value before calling `readHeader`. |
91 | kInvalid = 0, |
92 | // Values that can be set by `readMMEHeader`. |
93 | kPattern = 1, |
94 | kReal = 2, |
95 | kInteger = 3, |
96 | kComplex = 4, |
97 | // The value set by `readExtFROSTTHeader`. |
98 | kUndefined = 5 |
99 | }; |
100 | |
101 | explicit SparseTensorReader(const char *filename) : filename(filename) { |
102 | assert(filename && "Received nullptr for filename" ); |
103 | } |
104 | |
105 | // Disallows copying, to avoid duplicating the `file` pointer. |
106 | SparseTensorReader(const SparseTensorReader &) = delete; |
107 | SparseTensorReader &operator=(const SparseTensorReader &) = delete; |
108 | |
109 | /// Factory method to allocate a new reader, open the file, read the |
110 | /// header, and validate that the actual contents of the file match |
111 | /// the expected `dimShape` and `valTp`. |
112 | static SparseTensorReader *create(const char *filename, uint64_t dimRank, |
113 | const uint64_t *dimShape, |
114 | PrimaryType valTp) { |
115 | SparseTensorReader *reader = new SparseTensorReader(filename); |
116 | reader->openFile(); |
117 | reader->readHeader(); |
118 | if (!reader->canReadAs(valTy: valTp)) { |
119 | fprintf(stderr, |
120 | format: "Tensor element type %d not compatible with values in file %s\n" , |
121 | static_cast<int>(valTp), filename); |
122 | exit(status: 1); |
123 | } |
124 | reader->assertMatchesShape(rank: dimRank, shape: dimShape); |
125 | return reader; |
126 | } |
127 | |
128 | // This dtor tries to avoid leaking the `file`. (Though it's better |
129 | // to call `closeFile` explicitly when possible, since there are |
130 | // circumstances where dtors are not called reliably.) |
131 | ~SparseTensorReader() { closeFile(); } |
132 | |
133 | /// Opens the file for reading. |
134 | void openFile(); |
135 | |
136 | /// Closes the file. |
137 | void closeFile(); |
138 | |
139 | /// Reads and parses the file's header. |
140 | void (); |
141 | |
142 | /// Returns the stored value kind. |
143 | ValueKind getValueKind() const { return valueKind_; } |
144 | |
145 | /// Checks if a header has been successfully read. |
146 | bool isValid() const { return valueKind_ != ValueKind::kInvalid; } |
147 | |
148 | /// Checks if the file's ValueKind can be converted into the given |
149 | /// tensor PrimaryType. Is only valid after parsing the header. |
150 | bool canReadAs(PrimaryType valTy) const; |
151 | |
152 | /// Gets the MME "pattern" property setting. Is only valid after |
153 | /// parsing the header. |
154 | bool isPattern() const { |
155 | assert(isValid() && "Attempt to isPattern() before readHeader()" ); |
156 | return valueKind_ == ValueKind::kPattern; |
157 | } |
158 | |
159 | /// Gets the MME "symmetric" property setting. Is only valid after |
160 | /// parsing the header. |
161 | bool isSymmetric() const { |
162 | assert(isValid() && "Attempt to isSymmetric() before readHeader()" ); |
163 | return isSymmetric_; |
164 | } |
165 | |
166 | /// Gets the dimension-rank of the tensor. Is only valid after parsing |
167 | /// the header. |
168 | uint64_t getRank() const { |
169 | assert(isValid() && "Attempt to getRank() before readHeader()" ); |
170 | return idata[0]; |
171 | } |
172 | |
173 | /// Gets the number of stored elements. Is only valid after parsing |
174 | /// the header. |
175 | uint64_t getNSE() const { |
176 | assert(isValid() && "Attempt to getNSE() before readHeader()" ); |
177 | return idata[1]; |
178 | } |
179 | |
180 | /// Gets the dimension-sizes array. The pointer itself is always |
181 | /// valid; however, the values stored therein are only valid after |
182 | /// parsing the header. |
183 | const uint64_t *getDimSizes() const { return idata + 2; } |
184 | |
185 | /// Safely gets the size of the given dimension. Is only valid |
186 | /// after parsing the header. |
187 | uint64_t getDimSize(uint64_t d) const { |
188 | assert(d < getRank() && "Dimension out of bounds" ); |
189 | return idata[2 + d]; |
190 | } |
191 | |
192 | /// Asserts the shape subsumes the actual dimension sizes. Is only |
193 | /// valid after parsing the header. |
194 | void assertMatchesShape(uint64_t rank, const uint64_t *shape) const; |
195 | |
196 | /// Allocates a new sparse-tensor storage object with the given encoding, |
197 | /// initializes it by reading all the elements from the file, and then |
198 | /// closes the file. Templated on P, I, and V. |
199 | template <typename P, typename I, typename V> |
200 | SparseTensorStorage<P, I, V> * |
201 | readSparseTensor(uint64_t lvlRank, const uint64_t *lvlSizes, |
202 | const LevelType *lvlTypes, const uint64_t *dim2lvl, |
203 | const uint64_t *lvl2dim) { |
204 | const uint64_t dimRank = getRank(); |
205 | MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim); |
206 | auto *lvlCOO = readCOO<V>(map, lvlSizes); |
207 | auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO( |
208 | dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, |
209 | lvlCOO); |
210 | delete lvlCOO; |
211 | return tensor; |
212 | } |
213 | |
214 | /// Reads the COO tensor from the file, stores the coordinates and values to |
215 | /// the given buffers, returns a boolean value to indicate whether the COO |
216 | /// elements are sorted. |
217 | template <typename C, typename V> |
218 | bool readToBuffers(uint64_t lvlRank, const uint64_t *dim2lvl, |
219 | const uint64_t *lvl2dim, C *lvlCoordinates, V *values); |
220 | |
221 | private: |
222 | /// Attempts to read a line from the file. |
223 | void readLine(); |
224 | |
225 | /// Reads the next line of the input file and parses the coordinates |
226 | /// into the `dimCoords` argument. Returns the position in the `line` |
227 | /// buffer where the element's value should be parsed from. |
228 | template <typename C> |
229 | char *readCoords(C *dimCoords) { |
230 | readLine(); |
231 | // Local variable for tracking the parser's position in the `line` buffer. |
232 | char *linePtr = line; |
233 | for (uint64_t dimRank = getRank(), d = 0; d < dimRank; ++d) { |
234 | // Parse the 1-based coordinate. |
235 | uint64_t c = strtoul(nptr: linePtr, endptr: &linePtr, base: 10); |
236 | // Store the 0-based coordinate. |
237 | dimCoords[d] = static_cast<C>(c - 1); |
238 | } |
239 | return linePtr; |
240 | } |
241 | |
242 | /// Reads all the elements from the file while applying the given map. |
243 | template <typename V> |
244 | SparseTensorCOO<V> *readCOO(const MapRef &map, const uint64_t *lvlSizes); |
245 | |
246 | /// The implementation of `readCOO` that is templated `IsPattern` in order |
247 | /// to perform LICM without needing to duplicate the source code. |
248 | template <typename V, bool IsPattern> |
249 | void readCOOLoop(const MapRef &map, SparseTensorCOO<V> *coo); |
250 | |
251 | /// The internal implementation of `readToBuffers`. We template over |
252 | /// `IsPattern` in order to perform LICM without needing to duplicate |
253 | /// the source code. |
254 | template <typename C, typename V, bool IsPattern> |
255 | bool readToBuffersLoop(const MapRef &map, C *lvlCoordinates, V *values); |
256 | |
257 | /// Reads the MME header of a general sparse matrix of type real. |
258 | void (); |
259 | |
260 | /// Reads the "extended" FROSTT header. Although not part of the |
261 | /// documented format, we assume that the file starts with optional |
262 | /// comments followed by two lines that define the rank, the number of |
263 | /// nonzeros, and the dimensions sizes (one per rank) of the sparse tensor. |
264 | void (); |
265 | |
266 | static constexpr int kColWidth = 1025; |
267 | const char *const filename; |
268 | FILE *file = nullptr; |
269 | ValueKind valueKind_ = ValueKind::kInvalid; |
270 | bool isSymmetric_ = false; |
271 | uint64_t idata[512]; |
272 | char line[kColWidth]; |
273 | }; |
274 | |
275 | //===----------------------------------------------------------------------===// |
276 | // |
277 | // Reader class methods. |
278 | // |
279 | //===----------------------------------------------------------------------===// |
280 | |
281 | template <typename V> |
282 | SparseTensorCOO<V> *SparseTensorReader::readCOO(const MapRef &map, |
283 | const uint64_t *lvlSizes) { |
284 | assert(isValid() && "Attempt to readCOO() before readHeader()" ); |
285 | // Prepare a COO object with the number of stored elems as initial capacity. |
286 | auto *coo = new SparseTensorCOO<V>(map.getLvlRank(), lvlSizes, getNSE()); |
287 | // Enter the reading loop. |
288 | if (isPattern()) |
289 | readCOOLoop<V, true>(map, coo); |
290 | else |
291 | readCOOLoop<V, false>(map, coo); |
292 | // Close the file and return the COO. |
293 | closeFile(); |
294 | return coo; |
295 | } |
296 | |
297 | template <typename V, bool IsPattern> |
298 | void SparseTensorReader::readCOOLoop(const MapRef &map, |
299 | SparseTensorCOO<V> *coo) { |
300 | const uint64_t dimRank = map.getDimRank(); |
301 | const uint64_t lvlRank = map.getLvlRank(); |
302 | assert(dimRank == getRank()); |
303 | std::vector<uint64_t> dimCoords(dimRank); |
304 | std::vector<uint64_t> lvlCoords(lvlRank); |
305 | for (uint64_t k = 0, nse = getNSE(); k < nse; k++) { |
306 | char *linePtr = readCoords(dimCoords: dimCoords.data()); |
307 | const V value = detail::readValue<V, IsPattern>(&linePtr); |
308 | map.pushforward(in: dimCoords.data(), out: lvlCoords.data()); |
309 | coo->add(lvlCoords, value); |
310 | } |
311 | } |
312 | |
313 | template <typename C, typename V> |
314 | bool SparseTensorReader::readToBuffers(uint64_t lvlRank, |
315 | const uint64_t *dim2lvl, |
316 | const uint64_t *lvl2dim, |
317 | C *lvlCoordinates, V *values) { |
318 | assert(isValid() && "Attempt to readCOO() before readHeader()" ); |
319 | MapRef map(getRank(), lvlRank, dim2lvl, lvl2dim); |
320 | bool isSorted = |
321 | isPattern() ? readToBuffersLoop<C, V, true>(map, lvlCoordinates, values) |
322 | : readToBuffersLoop<C, V, false>(map, lvlCoordinates, values); |
323 | closeFile(); |
324 | return isSorted; |
325 | } |
326 | |
327 | template <typename C, typename V, bool IsPattern> |
328 | bool SparseTensorReader::readToBuffersLoop(const MapRef &map, C *lvlCoordinates, |
329 | V *values) { |
330 | const uint64_t dimRank = map.getDimRank(); |
331 | const uint64_t lvlRank = map.getLvlRank(); |
332 | const uint64_t nse = getNSE(); |
333 | assert(dimRank == getRank()); |
334 | std::vector<C> dimCoords(dimRank); |
335 | bool isSorted = false; |
336 | char *linePtr; |
337 | const auto readNextElement = [&]() { |
338 | linePtr = readCoords<C>(dimCoords.data()); |
339 | map.pushforward(dimCoords.data(), lvlCoordinates); |
340 | *values = detail::readValue<V, IsPattern>(&linePtr); |
341 | if (isSorted) { |
342 | // Note that isSorted is set to false when reading the first element, |
343 | // to guarantee the safeness of using prevLvlCoords. |
344 | C *prevLvlCoords = lvlCoordinates - lvlRank; |
345 | for (uint64_t l = 0; l < lvlRank; ++l) { |
346 | if (prevLvlCoords[l] != lvlCoordinates[l]) { |
347 | if (prevLvlCoords[l] > lvlCoordinates[l]) |
348 | isSorted = false; |
349 | break; |
350 | } |
351 | } |
352 | } |
353 | lvlCoordinates += lvlRank; |
354 | ++values; |
355 | }; |
356 | readNextElement(); |
357 | isSorted = true; |
358 | for (uint64_t n = 1; n < nse; ++n) |
359 | readNextElement(); |
360 | return isSorted; |
361 | } |
362 | |
363 | } // namespace sparse_tensor |
364 | } // namespace mlir |
365 | |
366 | #endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H |
367 | |