1//===- File.h - Reading sparse tensors from files ---------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements reading sparse tensor from files in one of the
10// following external formats:
11//
12// (1) Matrix Market Exchange (MME): *.mtx
13// https://math.nist.gov/MatrixMarket/formats.html
14//
15// (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns
16// http://frostt.io/tensors/file-formats.html
17//
18//===----------------------------------------------------------------------===//
19
20#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H
21#define MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H
22
23#include "mlir/ExecutionEngine/SparseTensor/MapRef.h"
24#include "mlir/ExecutionEngine/SparseTensor/Storage.h"
25
26#include <fstream>
27
28namespace mlir {
29namespace sparse_tensor {
30
31namespace detail {
32
33template <typename T>
34struct is_complex final : public std::false_type {};
35
36template <typename T>
37struct is_complex<std::complex<T>> final : public std::true_type {};
38
39/// Returns an element-value of non-complex type. If `IsPattern` is true,
40/// then returns an arbitrary value. If `IsPattern` is false, then
41/// reads the value from the current line buffer beginning at `linePtr`.
42template <typename V, bool IsPattern>
43inline std::enable_if_t<!is_complex<V>::value, V> readValue(char **linePtr) {
44 // The external formats always store these numerical values with the type
45 // double, but we cast these values to the sparse tensor object type.
46 // For a pattern tensor, we arbitrarily pick the value 1 for all entries.
47 if constexpr (IsPattern)
48 return 1.0;
49 return strtod(nptr: *linePtr, endptr: linePtr);
50}
51
52/// Returns an element-value of complex type. If `IsPattern` is true,
53/// then returns an arbitrary value. If `IsPattern` is false, then reads
54/// the value from the current line buffer beginning at `linePtr`.
55template <typename V, bool IsPattern>
56inline std::enable_if_t<is_complex<V>::value, V> readValue(char **linePtr) {
57 // Read two values to make a complex. The external formats always store
58 // numerical values with the type double, but we cast these values to the
59 // sparse tensor object type. For a pattern tensor, we arbitrarily pick the
60 // value 1 for all entries.
61 if constexpr (IsPattern)
62 return V(1.0, 1.0);
63 double re = strtod(nptr: *linePtr, endptr: linePtr);
64 double im = strtod(nptr: *linePtr, endptr: linePtr);
65 // Avoiding brace-notation since that forbids narrowing to `float`.
66 return V(re, im);
67}
68
69/// Returns an element-value. If `isPattern` is true, then returns an
70/// arbitrary value. If `isPattern` is false, then reads the value from
71/// the current line buffer beginning at `linePtr`.
72template <typename V>
73inline V readValue(char **linePtr, bool isPattern) {
74 return isPattern ? readValue<V, true>(linePtr) : readValue<V, false>(linePtr);
75}
76
77} // namespace detail
78
79//===----------------------------------------------------------------------===//
80//
81// Reader class.
82//
83//===----------------------------------------------------------------------===//
84
85/// This class abstracts over the information stored in file headers,
86/// as well as providing the buffers and methods for parsing those headers.
87class SparseTensorReader final {
88public:
89 enum class ValueKind : uint8_t {
90 // The value before calling `readHeader`.
91 kInvalid = 0,
92 // Values that can be set by `readMMEHeader`.
93 kPattern = 1,
94 kReal = 2,
95 kInteger = 3,
96 kComplex = 4,
97 // The value set by `readExtFROSTTHeader`.
98 kUndefined = 5
99 };
100
101 explicit SparseTensorReader(const char *filename) : filename(filename) {
102 assert(filename && "Received nullptr for filename");
103 }
104
105 // Disallows copying, to avoid duplicating the `file` pointer.
106 SparseTensorReader(const SparseTensorReader &) = delete;
107 SparseTensorReader &operator=(const SparseTensorReader &) = delete;
108
109 /// Factory method to allocate a new reader, open the file, read the
110 /// header, and validate that the actual contents of the file match
111 /// the expected `dimShape` and `valTp`.
112 static SparseTensorReader *create(const char *filename, uint64_t dimRank,
113 const uint64_t *dimShape,
114 PrimaryType valTp) {
115 SparseTensorReader *reader = new SparseTensorReader(filename);
116 reader->openFile();
117 reader->readHeader();
118 if (!reader->canReadAs(valTy: valTp)) {
119 fprintf(stderr,
120 format: "Tensor element type %d not compatible with values in file %s\n",
121 static_cast<int>(valTp), filename);
122 exit(status: 1);
123 }
124 reader->assertMatchesShape(rank: dimRank, shape: dimShape);
125 return reader;
126 }
127
128 // This dtor tries to avoid leaking the `file`. (Though it's better
129 // to call `closeFile` explicitly when possible, since there are
130 // circumstances where dtors are not called reliably.)
131 ~SparseTensorReader() { closeFile(); }
132
133 /// Opens the file for reading.
134 void openFile();
135
136 /// Closes the file.
137 void closeFile();
138
139 /// Reads and parses the file's header.
140 void readHeader();
141
142 /// Returns the stored value kind.
143 ValueKind getValueKind() const { return valueKind_; }
144
145 /// Checks if a header has been successfully read.
146 bool isValid() const { return valueKind_ != ValueKind::kInvalid; }
147
148 /// Checks if the file's ValueKind can be converted into the given
149 /// tensor PrimaryType. Is only valid after parsing the header.
150 bool canReadAs(PrimaryType valTy) const;
151
152 /// Gets the MME "pattern" property setting. Is only valid after
153 /// parsing the header.
154 bool isPattern() const {
155 assert(isValid() && "Attempt to isPattern() before readHeader()");
156 return valueKind_ == ValueKind::kPattern;
157 }
158
159 /// Gets the MME "symmetric" property setting. Is only valid after
160 /// parsing the header.
161 bool isSymmetric() const {
162 assert(isValid() && "Attempt to isSymmetric() before readHeader()");
163 return isSymmetric_;
164 }
165
166 /// Gets the dimension-rank of the tensor. Is only valid after parsing
167 /// the header.
168 uint64_t getRank() const {
169 assert(isValid() && "Attempt to getRank() before readHeader()");
170 return idata[0];
171 }
172
173 /// Gets the number of stored elements. Is only valid after parsing
174 /// the header.
175 uint64_t getNSE() const {
176 assert(isValid() && "Attempt to getNSE() before readHeader()");
177 return idata[1];
178 }
179
180 /// Gets the dimension-sizes array. The pointer itself is always
181 /// valid; however, the values stored therein are only valid after
182 /// parsing the header.
183 const uint64_t *getDimSizes() const { return idata + 2; }
184
185 /// Safely gets the size of the given dimension. Is only valid
186 /// after parsing the header.
187 uint64_t getDimSize(uint64_t d) const {
188 assert(d < getRank() && "Dimension out of bounds");
189 return idata[2 + d];
190 }
191
192 /// Asserts the shape subsumes the actual dimension sizes. Is only
193 /// valid after parsing the header.
194 void assertMatchesShape(uint64_t rank, const uint64_t *shape) const;
195
196 /// Allocates a new sparse-tensor storage object with the given encoding,
197 /// initializes it by reading all the elements from the file, and then
198 /// closes the file. Templated on P, I, and V.
199 template <typename P, typename I, typename V>
200 SparseTensorStorage<P, I, V> *
201 readSparseTensor(uint64_t lvlRank, const uint64_t *lvlSizes,
202 const LevelType *lvlTypes, const uint64_t *dim2lvl,
203 const uint64_t *lvl2dim) {
204 const uint64_t dimRank = getRank();
205 MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim);
206 auto *lvlCOO = readCOO<V>(map, lvlSizes);
207 auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO(
208 dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,
209 lvlCOO);
210 delete lvlCOO;
211 return tensor;
212 }
213
214 /// Reads the COO tensor from the file, stores the coordinates and values to
215 /// the given buffers, returns a boolean value to indicate whether the COO
216 /// elements are sorted.
217 template <typename C, typename V>
218 bool readToBuffers(uint64_t lvlRank, const uint64_t *dim2lvl,
219 const uint64_t *lvl2dim, C *lvlCoordinates, V *values);
220
221private:
222 /// Attempts to read a line from the file.
223 void readLine();
224
225 /// Reads the next line of the input file and parses the coordinates
226 /// into the `dimCoords` argument. Returns the position in the `line`
227 /// buffer where the element's value should be parsed from.
228 template <typename C>
229 char *readCoords(C *dimCoords) {
230 readLine();
231 // Local variable for tracking the parser's position in the `line` buffer.
232 char *linePtr = line;
233 for (uint64_t dimRank = getRank(), d = 0; d < dimRank; ++d) {
234 // Parse the 1-based coordinate.
235 uint64_t c = strtoul(nptr: linePtr, endptr: &linePtr, base: 10);
236 // Store the 0-based coordinate.
237 dimCoords[d] = static_cast<C>(c - 1);
238 }
239 return linePtr;
240 }
241
242 /// Reads all the elements from the file while applying the given map.
243 template <typename V>
244 SparseTensorCOO<V> *readCOO(const MapRef &map, const uint64_t *lvlSizes);
245
246 /// The implementation of `readCOO` that is templated `IsPattern` in order
247 /// to perform LICM without needing to duplicate the source code.
248 template <typename V, bool IsPattern>
249 void readCOOLoop(const MapRef &map, SparseTensorCOO<V> *coo);
250
251 /// The internal implementation of `readToBuffers`. We template over
252 /// `IsPattern` in order to perform LICM without needing to duplicate
253 /// the source code.
254 template <typename C, typename V, bool IsPattern>
255 bool readToBuffersLoop(const MapRef &map, C *lvlCoordinates, V *values);
256
257 /// Reads the MME header of a general sparse matrix of type real.
258 void readMMEHeader();
259
260 /// Reads the "extended" FROSTT header. Although not part of the
261 /// documented format, we assume that the file starts with optional
262 /// comments followed by two lines that define the rank, the number of
263 /// nonzeros, and the dimensions sizes (one per rank) of the sparse tensor.
264 void readExtFROSTTHeader();
265
266 static constexpr int kColWidth = 1025;
267 const char *const filename;
268 FILE *file = nullptr;
269 ValueKind valueKind_ = ValueKind::kInvalid;
270 bool isSymmetric_ = false;
271 uint64_t idata[512];
272 char line[kColWidth];
273};
274
275//===----------------------------------------------------------------------===//
276//
277// Reader class methods.
278//
279//===----------------------------------------------------------------------===//
280
281template <typename V>
282SparseTensorCOO<V> *SparseTensorReader::readCOO(const MapRef &map,
283 const uint64_t *lvlSizes) {
284 assert(isValid() && "Attempt to readCOO() before readHeader()");
285 // Prepare a COO object with the number of stored elems as initial capacity.
286 auto *coo = new SparseTensorCOO<V>(map.getLvlRank(), lvlSizes, getNSE());
287 // Enter the reading loop.
288 if (isPattern())
289 readCOOLoop<V, true>(map, coo);
290 else
291 readCOOLoop<V, false>(map, coo);
292 // Close the file and return the COO.
293 closeFile();
294 return coo;
295}
296
297template <typename V, bool IsPattern>
298void SparseTensorReader::readCOOLoop(const MapRef &map,
299 SparseTensorCOO<V> *coo) {
300 const uint64_t dimRank = map.getDimRank();
301 const uint64_t lvlRank = map.getLvlRank();
302 assert(dimRank == getRank());
303 std::vector<uint64_t> dimCoords(dimRank);
304 std::vector<uint64_t> lvlCoords(lvlRank);
305 for (uint64_t k = 0, nse = getNSE(); k < nse; k++) {
306 char *linePtr = readCoords(dimCoords: dimCoords.data());
307 const V value = detail::readValue<V, IsPattern>(&linePtr);
308 map.pushforward(in: dimCoords.data(), out: lvlCoords.data());
309 coo->add(lvlCoords, value);
310 }
311}
312
313template <typename C, typename V>
314bool SparseTensorReader::readToBuffers(uint64_t lvlRank,
315 const uint64_t *dim2lvl,
316 const uint64_t *lvl2dim,
317 C *lvlCoordinates, V *values) {
318 assert(isValid() && "Attempt to readCOO() before readHeader()");
319 MapRef map(getRank(), lvlRank, dim2lvl, lvl2dim);
320 bool isSorted =
321 isPattern() ? readToBuffersLoop<C, V, true>(map, lvlCoordinates, values)
322 : readToBuffersLoop<C, V, false>(map, lvlCoordinates, values);
323 closeFile();
324 return isSorted;
325}
326
327template <typename C, typename V, bool IsPattern>
328bool SparseTensorReader::readToBuffersLoop(const MapRef &map, C *lvlCoordinates,
329 V *values) {
330 const uint64_t dimRank = map.getDimRank();
331 const uint64_t lvlRank = map.getLvlRank();
332 const uint64_t nse = getNSE();
333 assert(dimRank == getRank());
334 std::vector<C> dimCoords(dimRank);
335 bool isSorted = false;
336 char *linePtr;
337 const auto readNextElement = [&]() {
338 linePtr = readCoords<C>(dimCoords.data());
339 map.pushforward(dimCoords.data(), lvlCoordinates);
340 *values = detail::readValue<V, IsPattern>(&linePtr);
341 if (isSorted) {
342 // Note that isSorted is set to false when reading the first element,
343 // to guarantee the safeness of using prevLvlCoords.
344 C *prevLvlCoords = lvlCoordinates - lvlRank;
345 for (uint64_t l = 0; l < lvlRank; ++l) {
346 if (prevLvlCoords[l] != lvlCoordinates[l]) {
347 if (prevLvlCoords[l] > lvlCoordinates[l])
348 isSorted = false;
349 break;
350 }
351 }
352 }
353 lvlCoordinates += lvlRank;
354 ++values;
355 };
356 readNextElement();
357 isSorted = true;
358 for (uint64_t n = 1; n < nse; ++n)
359 readNextElement();
360 return isSorted;
361}
362
363} // namespace sparse_tensor
364} // namespace mlir
365
366#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H
367

source code of mlir/include/mlir/ExecutionEngine/SparseTensor/File.h