1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _BCACHEFS_JOURNAL_H |
3 | #define _BCACHEFS_JOURNAL_H |
4 | |
5 | /* |
6 | * THE JOURNAL: |
7 | * |
8 | * The primary purpose of the journal is to log updates (insertions) to the |
9 | * b-tree, to avoid having to do synchronous updates to the b-tree on disk. |
10 | * |
11 | * Without the journal, the b-tree is always internally consistent on |
12 | * disk - and in fact, in the earliest incarnations bcache didn't have a journal |
13 | * but did handle unclean shutdowns by doing all index updates synchronously |
14 | * (with coalescing). |
15 | * |
16 | * Updates to interior nodes still happen synchronously and without the journal |
17 | * (for simplicity) - this may change eventually but updates to interior nodes |
18 | * are rare enough it's not a huge priority. |
19 | * |
20 | * This means the journal is relatively separate from the b-tree; it consists of |
21 | * just a list of keys and journal replay consists of just redoing those |
22 | * insertions in same order that they appear in the journal. |
23 | * |
24 | * PERSISTENCE: |
25 | * |
26 | * For synchronous updates (where we're waiting on the index update to hit |
27 | * disk), the journal entry will be written out immediately (or as soon as |
28 | * possible, if the write for the previous journal entry was still in flight). |
29 | * |
30 | * Synchronous updates are specified by passing a closure (@flush_cl) to |
31 | * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter |
32 | * down to the journalling code. That closure will wait on the journal write to |
33 | * complete (via closure_wait()). |
34 | * |
35 | * If the index update wasn't synchronous, the journal entry will be |
36 | * written out after 10 ms have elapsed, by default (the delay_ms field |
37 | * in struct journal). |
38 | * |
39 | * JOURNAL ENTRIES: |
40 | * |
41 | * A journal entry is variable size (struct jset), it's got a fixed length |
42 | * header and then a variable number of struct jset_entry entries. |
43 | * |
44 | * Journal entries are identified by monotonically increasing 64 bit sequence |
45 | * numbers - jset->seq; other places in the code refer to this sequence number. |
46 | * |
47 | * A jset_entry entry contains one or more bkeys (which is what gets inserted |
48 | * into the b-tree). We need a container to indicate which b-tree the key is |
49 | * for; also, the roots of the various b-trees are stored in jset_entry entries |
50 | * (one for each b-tree) - this lets us add new b-tree types without changing |
51 | * the on disk format. |
52 | * |
53 | * We also keep some things in the journal header that are logically part of the |
54 | * superblock - all the things that are frequently updated. This is for future |
55 | * bcache on raw flash support; the superblock (which will become another |
56 | * journal) can't be moved or wear leveled, so it contains just enough |
57 | * information to find the main journal, and the superblock only has to be |
58 | * rewritten when we want to move/wear level the main journal. |
59 | * |
60 | * JOURNAL LAYOUT ON DISK: |
61 | * |
62 | * The journal is written to a ringbuffer of buckets (which is kept in the |
63 | * superblock); the individual buckets are not necessarily contiguous on disk |
64 | * which means that journal entries are not allowed to span buckets, but also |
65 | * that we can resize the journal at runtime if desired (unimplemented). |
66 | * |
67 | * The journal buckets exist in the same pool as all the other buckets that are |
68 | * managed by the allocator and garbage collection - garbage collection marks |
69 | * the journal buckets as metadata buckets. |
70 | * |
71 | * OPEN/DIRTY JOURNAL ENTRIES: |
72 | * |
73 | * Open/dirty journal entries are journal entries that contain b-tree updates |
74 | * that have not yet been written out to the b-tree on disk. We have to track |
75 | * which journal entries are dirty, and we also have to avoid wrapping around |
76 | * the journal and overwriting old but still dirty journal entries with new |
77 | * journal entries. |
78 | * |
79 | * On disk, this is represented with the "last_seq" field of struct jset; |
80 | * last_seq is the first sequence number that journal replay has to replay. |
81 | * |
82 | * To avoid overwriting dirty journal entries on disk, we keep a mapping (in |
83 | * journal_device->seq) of for each journal bucket, the highest sequence number |
84 | * any journal entry it contains. Then, by comparing that against last_seq we |
85 | * can determine whether that journal bucket contains dirty journal entries or |
86 | * not. |
87 | * |
88 | * To track which journal entries are dirty, we maintain a fifo of refcounts |
89 | * (where each entry corresponds to a specific sequence number) - when a ref |
90 | * goes to 0, that journal entry is no longer dirty. |
91 | * |
92 | * Journalling of index updates is done at the same time as the b-tree itself is |
93 | * being modified (see btree_insert_key()); when we add the key to the journal |
94 | * the pending b-tree write takes a ref on the journal entry the key was added |
95 | * to. If a pending b-tree write would need to take refs on multiple dirty |
96 | * journal entries, it only keeps the ref on the oldest one (since a newer |
97 | * journal entry will still be replayed if an older entry was dirty). |
98 | * |
99 | * JOURNAL FILLING UP: |
100 | * |
101 | * There are two ways the journal could fill up; either we could run out of |
102 | * space to write to, or we could have too many open journal entries and run out |
103 | * of room in the fifo of refcounts. Since those refcounts are decremented |
104 | * without any locking we can't safely resize that fifo, so we handle it the |
105 | * same way. |
106 | * |
107 | * If the journal fills up, we start flushing dirty btree nodes until we can |
108 | * allocate space for a journal write again - preferentially flushing btree |
109 | * nodes that are pinning the oldest journal entries first. |
110 | */ |
111 | |
112 | #include <linux/hash.h> |
113 | |
114 | #include "journal_types.h" |
115 | |
116 | struct bch_fs; |
117 | |
118 | static inline void journal_wake(struct journal *j) |
119 | { |
120 | wake_up(&j->wait); |
121 | closure_wake_up(list: &j->async_wait); |
122 | } |
123 | |
124 | static inline struct journal_buf *journal_cur_buf(struct journal *j) |
125 | { |
126 | return j->buf + j->reservations.idx; |
127 | } |
128 | |
129 | /* Sequence number of oldest dirty journal entry */ |
130 | |
131 | static inline u64 journal_last_seq(struct journal *j) |
132 | { |
133 | return j->pin.front; |
134 | } |
135 | |
136 | static inline u64 journal_cur_seq(struct journal *j) |
137 | { |
138 | return atomic64_read(v: &j->seq); |
139 | } |
140 | |
141 | static inline u64 journal_last_unwritten_seq(struct journal *j) |
142 | { |
143 | return j->seq_ondisk + 1; |
144 | } |
145 | |
146 | static inline int journal_state_count(union journal_res_state s, int idx) |
147 | { |
148 | switch (idx) { |
149 | case 0: return s.buf0_count; |
150 | case 1: return s.buf1_count; |
151 | case 2: return s.buf2_count; |
152 | case 3: return s.buf3_count; |
153 | } |
154 | BUG(); |
155 | } |
156 | |
157 | static inline void journal_state_inc(union journal_res_state *s) |
158 | { |
159 | s->buf0_count += s->idx == 0; |
160 | s->buf1_count += s->idx == 1; |
161 | s->buf2_count += s->idx == 2; |
162 | s->buf3_count += s->idx == 3; |
163 | } |
164 | |
165 | /* |
166 | * Amount of space that will be taken up by some keys in the journal (i.e. |
167 | * including the jset header) |
168 | */ |
169 | static inline unsigned jset_u64s(unsigned u64s) |
170 | { |
171 | return u64s + sizeof(struct jset_entry) / sizeof(u64); |
172 | } |
173 | |
174 | static inline int journal_entry_overhead(struct journal *j) |
175 | { |
176 | return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; |
177 | } |
178 | |
179 | static inline struct jset_entry * |
180 | bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) |
181 | { |
182 | struct jset *jset = buf->data; |
183 | struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); |
184 | |
185 | memset(entry, 0, sizeof(*entry)); |
186 | entry->u64s = cpu_to_le16(u64s); |
187 | |
188 | le32_add_cpu(var: &jset->u64s, val: jset_u64s(u64s)); |
189 | |
190 | return entry; |
191 | } |
192 | |
193 | static inline struct jset_entry * |
194 | journal_res_entry(struct journal *j, struct journal_res *res) |
195 | { |
196 | return vstruct_idx(j->buf[res->idx].data, res->offset); |
197 | } |
198 | |
199 | static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, |
200 | enum btree_id id, unsigned level, |
201 | unsigned u64s) |
202 | { |
203 | entry->u64s = cpu_to_le16(u64s); |
204 | entry->btree_id = id; |
205 | entry->level = level; |
206 | entry->type = type; |
207 | entry->pad[0] = 0; |
208 | entry->pad[1] = 0; |
209 | entry->pad[2] = 0; |
210 | return jset_u64s(u64s); |
211 | } |
212 | |
213 | static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, |
214 | enum btree_id id, unsigned level, |
215 | const void *data, unsigned u64s) |
216 | { |
217 | unsigned ret = journal_entry_init(entry, type, id, level, u64s); |
218 | |
219 | memcpy_u64s_small(dst: entry->_data, src: data, u64s); |
220 | return ret; |
221 | } |
222 | |
223 | static inline struct jset_entry * |
224 | bch2_journal_add_entry(struct journal *j, struct journal_res *res, |
225 | unsigned type, enum btree_id id, |
226 | unsigned level, unsigned u64s) |
227 | { |
228 | struct jset_entry *entry = journal_res_entry(j, res); |
229 | unsigned actual = journal_entry_init(entry, type, id, level, u64s); |
230 | |
231 | EBUG_ON(!res->ref); |
232 | EBUG_ON(actual > res->u64s); |
233 | |
234 | res->offset += actual; |
235 | res->u64s -= actual; |
236 | return entry; |
237 | } |
238 | |
239 | static inline bool journal_entry_empty(struct jset *j) |
240 | { |
241 | if (j->seq != j->last_seq) |
242 | return false; |
243 | |
244 | vstruct_for_each(j, i) |
245 | if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) |
246 | return false; |
247 | return true; |
248 | } |
249 | |
250 | /* |
251 | * Drop reference on a buffer index and return true if the count has hit zero. |
252 | */ |
253 | static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) |
254 | { |
255 | union journal_res_state s; |
256 | |
257 | s.v = atomic64_sub_return(i: ((union journal_res_state) { |
258 | .buf0_count = idx == 0, |
259 | .buf1_count = idx == 1, |
260 | .buf2_count = idx == 2, |
261 | .buf3_count = idx == 3, |
262 | }).v, v: &j->reservations.counter); |
263 | return s; |
264 | } |
265 | |
266 | bool bch2_journal_entry_close(struct journal *); |
267 | void bch2_journal_do_writes(struct journal *); |
268 | void bch2_journal_buf_put_final(struct journal *, u64); |
269 | |
270 | static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) |
271 | { |
272 | union journal_res_state s; |
273 | |
274 | s = journal_state_buf_put(j, idx); |
275 | if (!journal_state_count(s, idx)) |
276 | bch2_journal_buf_put_final(j, seq); |
277 | } |
278 | |
279 | static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) |
280 | { |
281 | union journal_res_state s; |
282 | |
283 | s = journal_state_buf_put(j, idx); |
284 | if (!journal_state_count(s, idx)) { |
285 | spin_lock(lock: &j->lock); |
286 | bch2_journal_buf_put_final(j, seq); |
287 | spin_unlock(lock: &j->lock); |
288 | } |
289 | } |
290 | |
291 | /* |
292 | * This function releases the journal write structure so other threads can |
293 | * then proceed to add their keys as well. |
294 | */ |
295 | static inline void bch2_journal_res_put(struct journal *j, |
296 | struct journal_res *res) |
297 | { |
298 | if (!res->ref) |
299 | return; |
300 | |
301 | lock_release(lock: &j->res_map, _THIS_IP_); |
302 | |
303 | while (res->u64s) |
304 | bch2_journal_add_entry(j, res, |
305 | type: BCH_JSET_ENTRY_btree_keys, |
306 | id: 0, level: 0, u64s: 0); |
307 | |
308 | bch2_journal_buf_put(j, idx: res->idx, seq: res->seq); |
309 | |
310 | res->ref = 0; |
311 | } |
312 | |
313 | int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, |
314 | unsigned); |
315 | |
316 | /* First bits for BCH_WATERMARK: */ |
317 | enum journal_res_flags { |
318 | __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, |
319 | __JOURNAL_RES_GET_CHECK, |
320 | }; |
321 | |
322 | #define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) |
323 | #define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) |
324 | |
325 | static inline int journal_res_get_fast(struct journal *j, |
326 | struct journal_res *res, |
327 | unsigned flags) |
328 | { |
329 | union journal_res_state old, new; |
330 | u64 v = atomic64_read(v: &j->reservations.counter); |
331 | |
332 | do { |
333 | old.v = new.v = v; |
334 | |
335 | /* |
336 | * Check if there is still room in the current journal |
337 | * entry: |
338 | */ |
339 | if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) |
340 | return 0; |
341 | |
342 | EBUG_ON(!journal_state_count(new, new.idx)); |
343 | |
344 | if ((flags & BCH_WATERMARK_MASK) < j->watermark) |
345 | return 0; |
346 | |
347 | new.cur_entry_offset += res->u64s; |
348 | journal_state_inc(s: &new); |
349 | |
350 | /* |
351 | * If the refcount would overflow, we have to wait: |
352 | * XXX - tracepoint this: |
353 | */ |
354 | if (!journal_state_count(s: new, idx: new.idx)) |
355 | return 0; |
356 | |
357 | if (flags & JOURNAL_RES_GET_CHECK) |
358 | return 1; |
359 | } while ((v = atomic64_cmpxchg(v: &j->reservations.counter, |
360 | old: old.v, new: new.v)) != old.v); |
361 | |
362 | res->ref = true; |
363 | res->idx = old.idx; |
364 | res->offset = old.cur_entry_offset; |
365 | res->seq = le64_to_cpu(j->buf[old.idx].data->seq); |
366 | return 1; |
367 | } |
368 | |
369 | static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, |
370 | unsigned u64s, unsigned flags) |
371 | { |
372 | int ret; |
373 | |
374 | EBUG_ON(res->ref); |
375 | EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); |
376 | |
377 | res->u64s = u64s; |
378 | |
379 | if (journal_res_get_fast(j, res, flags)) |
380 | goto out; |
381 | |
382 | ret = bch2_journal_res_get_slowpath(j, res, flags); |
383 | if (ret) |
384 | return ret; |
385 | out: |
386 | if (!(flags & JOURNAL_RES_GET_CHECK)) { |
387 | lock_acquire_shared(&j->res_map, 0, |
388 | (flags & JOURNAL_RES_GET_NONBLOCK) != 0, |
389 | NULL, _THIS_IP_); |
390 | EBUG_ON(!res->ref); |
391 | } |
392 | return 0; |
393 | } |
394 | |
395 | /* journal_entry_res: */ |
396 | |
397 | void bch2_journal_entry_res_resize(struct journal *, |
398 | struct journal_entry_res *, |
399 | unsigned); |
400 | |
401 | int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); |
402 | void bch2_journal_flush_async(struct journal *, struct closure *); |
403 | |
404 | int bch2_journal_flush_seq(struct journal *, u64); |
405 | int bch2_journal_flush(struct journal *); |
406 | bool bch2_journal_noflush_seq(struct journal *, u64); |
407 | int bch2_journal_meta(struct journal *); |
408 | |
409 | void bch2_journal_halt(struct journal *); |
410 | |
411 | static inline int bch2_journal_error(struct journal *j) |
412 | { |
413 | return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL |
414 | ? -EIO : 0; |
415 | } |
416 | |
417 | struct bch_dev; |
418 | |
419 | static inline void bch2_journal_set_replay_done(struct journal *j) |
420 | { |
421 | BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); |
422 | set_bit(nr: JOURNAL_REPLAY_DONE, addr: &j->flags); |
423 | } |
424 | |
425 | void bch2_journal_unblock(struct journal *); |
426 | void bch2_journal_block(struct journal *); |
427 | struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); |
428 | |
429 | void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); |
430 | void bch2_journal_debug_to_text(struct printbuf *, struct journal *); |
431 | void bch2_journal_pins_to_text(struct printbuf *, struct journal *); |
432 | bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); |
433 | |
434 | int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, |
435 | unsigned nr); |
436 | int bch2_dev_journal_alloc(struct bch_dev *); |
437 | int bch2_fs_journal_alloc(struct bch_fs *); |
438 | |
439 | void bch2_dev_journal_stop(struct journal *, struct bch_dev *); |
440 | |
441 | void bch2_fs_journal_stop(struct journal *); |
442 | int bch2_fs_journal_start(struct journal *, u64); |
443 | |
444 | void bch2_dev_journal_exit(struct bch_dev *); |
445 | int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); |
446 | void bch2_fs_journal_exit(struct journal *); |
447 | int bch2_fs_journal_init(struct journal *); |
448 | |
449 | #endif /* _BCACHEFS_JOURNAL_H */ |
450 | |