1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _BCACHE_JOURNAL_H |
3 | #define _BCACHE_JOURNAL_H |
4 | |
5 | /* |
6 | * THE JOURNAL: |
7 | * |
8 | * The journal is treated as a circular buffer of buckets - a journal entry |
9 | * never spans two buckets. This means (not implemented yet) we can resize the |
10 | * journal at runtime, and will be needed for bcache on raw flash support. |
11 | * |
12 | * Journal entries contain a list of keys, ordered by the time they were |
13 | * inserted; thus journal replay just has to reinsert the keys. |
14 | * |
15 | * We also keep some things in the journal header that are logically part of the |
16 | * superblock - all the things that are frequently updated. This is for future |
17 | * bcache on raw flash support; the superblock (which will become another |
18 | * journal) can't be moved or wear leveled, so it contains just enough |
19 | * information to find the main journal, and the superblock only has to be |
20 | * rewritten when we want to move/wear level the main journal. |
21 | * |
22 | * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be |
23 | * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions |
24 | * from cache misses, which don't have to be journaled, and for writeback and |
25 | * moving gc we work around it by flushing the btree to disk before updating the |
26 | * gc information. But it is a potential issue with incremental garbage |
27 | * collection, and it's fragile. |
28 | * |
29 | * OPEN JOURNAL ENTRIES: |
30 | * |
31 | * Each journal entry contains, in the header, the sequence number of the last |
32 | * journal entry still open - i.e. that has keys that haven't been flushed to |
33 | * disk in the btree. |
34 | * |
35 | * We track this by maintaining a refcount for every open journal entry, in a |
36 | * fifo; each entry in the fifo corresponds to a particular journal |
37 | * entry/sequence number. When the refcount at the tail of the fifo goes to |
38 | * zero, we pop it off - thus, the size of the fifo tells us the number of open |
39 | * journal entries |
40 | * |
41 | * We take a refcount on a journal entry when we add some keys to a journal |
42 | * entry that we're going to insert (held by struct btree_op), and then when we |
43 | * insert those keys into the btree the btree write we're setting up takes a |
44 | * copy of that refcount (held by struct btree_write). That refcount is dropped |
45 | * when the btree write completes. |
46 | * |
47 | * A struct btree_write can only hold a refcount on a single journal entry, but |
48 | * might contain keys for many journal entries - we handle this by making sure |
49 | * it always has a refcount on the _oldest_ journal entry of all the journal |
50 | * entries it has keys for. |
51 | * |
52 | * JOURNAL RECLAIM: |
53 | * |
54 | * As mentioned previously, our fifo of refcounts tells us the number of open |
55 | * journal entries; from that and the current journal sequence number we compute |
56 | * last_seq - the oldest journal entry we still need. We write last_seq in each |
57 | * journal entry, and we also have to keep track of where it exists on disk so |
58 | * we don't overwrite it when we loop around the journal. |
59 | * |
60 | * To do that we track, for each journal bucket, the sequence number of the |
61 | * newest journal entry it contains - if we don't need that journal entry we |
62 | * don't need anything in that bucket anymore. From that we track the last |
63 | * journal bucket we still need; all this is tracked in struct journal_device |
64 | * and updated by journal_reclaim(). |
65 | * |
66 | * JOURNAL FILLING UP: |
67 | * |
68 | * There are two ways the journal could fill up; either we could run out of |
69 | * space to write to, or we could have too many open journal entries and run out |
70 | * of room in the fifo of refcounts. Since those refcounts are decremented |
71 | * without any locking we can't safely resize that fifo, so we handle it the |
72 | * same way. |
73 | * |
74 | * If the journal fills up, we start flushing dirty btree nodes until we can |
75 | * allocate space for a journal write again - preferentially flushing btree |
76 | * nodes that are pinning the oldest journal entries first. |
77 | */ |
78 | |
79 | /* |
80 | * Only used for holding the journal entries we read in btree_journal_read() |
81 | * during cache_registration |
82 | */ |
83 | struct journal_replay { |
84 | struct list_head list; |
85 | atomic_t *pin; |
86 | struct jset j; |
87 | }; |
88 | |
89 | /* |
90 | * We put two of these in struct journal; we used them for writes to the |
91 | * journal that are being staged or in flight. |
92 | */ |
93 | struct journal_write { |
94 | struct jset *data; |
95 | #define JSET_BITS 3 |
96 | |
97 | struct cache_set *c; |
98 | struct closure_waitlist wait; |
99 | bool dirty; |
100 | bool need_write; |
101 | }; |
102 | |
103 | /* Embedded in struct cache_set */ |
104 | struct journal { |
105 | spinlock_t lock; |
106 | spinlock_t flush_write_lock; |
107 | bool btree_flushing; |
108 | bool do_reserve; |
109 | /* used when waiting because the journal was full */ |
110 | struct closure_waitlist wait; |
111 | struct closure io; |
112 | int io_in_flight; |
113 | struct delayed_work work; |
114 | |
115 | /* Number of blocks free in the bucket(s) we're currently writing to */ |
116 | unsigned int blocks_free; |
117 | uint64_t seq; |
118 | DECLARE_FIFO(atomic_t, pin); |
119 | |
120 | BKEY_PADDED(key); |
121 | |
122 | struct journal_write w[2], *cur; |
123 | }; |
124 | |
125 | /* |
126 | * Embedded in struct cache. First three fields refer to the array of journal |
127 | * buckets, in cache_sb. |
128 | */ |
129 | struct journal_device { |
130 | /* |
131 | * For each journal bucket, contains the max sequence number of the |
132 | * journal writes it contains - so we know when a bucket can be reused. |
133 | */ |
134 | uint64_t seq[SB_JOURNAL_BUCKETS]; |
135 | |
136 | /* Journal bucket we're currently writing to */ |
137 | unsigned int cur_idx; |
138 | |
139 | /* Last journal bucket that still contains an open journal entry */ |
140 | unsigned int last_idx; |
141 | |
142 | /* Next journal bucket to be discarded */ |
143 | unsigned int discard_idx; |
144 | |
145 | #define DISCARD_READY 0 |
146 | #define DISCARD_IN_FLIGHT 1 |
147 | #define DISCARD_DONE 2 |
148 | /* 1 - discard in flight, -1 - discard completed */ |
149 | atomic_t discard_in_flight; |
150 | |
151 | struct work_struct discard_work; |
152 | struct bio discard_bio; |
153 | struct bio_vec discard_bv; |
154 | |
155 | /* Bio for journal reads/writes to this device */ |
156 | struct bio bio; |
157 | struct bio_vec bv[8]; |
158 | }; |
159 | |
160 | #define BTREE_FLUSH_NR 8 |
161 | |
162 | #define journal_pin_cmp(c, l, r) \ |
163 | (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) |
164 | |
165 | #define JOURNAL_PIN 20000 |
166 | |
167 | #define journal_full(j) \ |
168 | (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) |
169 | |
170 | struct closure; |
171 | struct cache_set; |
172 | struct btree_op; |
173 | struct keylist; |
174 | |
175 | atomic_t *bch_journal(struct cache_set *c, |
176 | struct keylist *keys, |
177 | struct closure *parent); |
178 | void bch_journal_next(struct journal *j); |
179 | void bch_journal_mark(struct cache_set *c, struct list_head *list); |
180 | void bch_journal_meta(struct cache_set *c, struct closure *cl); |
181 | int bch_journal_read(struct cache_set *c, struct list_head *list); |
182 | int bch_journal_replay(struct cache_set *c, struct list_head *list); |
183 | |
184 | void bch_journal_free(struct cache_set *c); |
185 | int bch_journal_alloc(struct cache_set *c); |
186 | void bch_journal_space_reserve(struct journal *j); |
187 | |
188 | #endif /* _BCACHE_JOURNAL_H */ |
189 | |