commit.c source code [linux/fs/jbd2/commit.c]

1	// SPDX-License-Identifier: GPL-2.0+
2	/*
3	* linux/fs/jbd2/commit.c
4	*
5	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6	*
7	* Copyright 1998 Red Hat corp --- All Rights Reserved
8	*
9	* Journal commit routines for the generic filesystem journaling code;
10	* part of the ext2fs journaling system.
11	*/
12
13	#include <linux/time.h>
14	#include <linux/fs.h>
15	#include <linux/jbd2.h>
16	#include <linux/errno.h>
17	#include <linux/slab.h>
18	#include <linux/mm.h>
19	#include <linux/pagemap.h>
20	#include <linux/jiffies.h>
21	#include <linux/crc32.h>
22	#include <linux/writeback.h>
23	#include <linux/backing-dev.h>
24	#include <linux/bio.h>
25	#include <linux/blkdev.h>
26	#include <linux/bitops.h>
27	#include <trace/events/jbd2.h>
28
29	/*
30	* IO end handler for temporary buffer_heads handling writes to the journal.
31	*/
32	static void journal_end_buffer_io_sync(struct buffer_head bh, int* uptodate)
33	{
34	struct buffer_head *orig_bh = bh->b_private;
35
36	BUFFER_TRACE(bh, "");
37	if (uptodate)
38	set_buffer_uptodate(bh);
39	else
40	clear_buffer_uptodate(bh);
41	if (orig_bh) {
42	clear_bit_unlock(nr: BH_Shadow, addr: &orig_bh->b_state);
43	smp_mb__after_atomic();
44	wake_up_bit(word: &orig_bh->b_state, bit: BH_Shadow);
45	}
46	unlock_buffer(bh);
47	}
48
49	/*
50	* When an ext4 file is truncated, it is possible that some pages are not
51	* successfully freed, because they are attached to a committing transaction.
52	* After the transaction commits, these pages are left on the LRU, with no
53	* ->mapping, and with attached buffers. These pages are trivially reclaimable
54	* by the VM, but their apparent absence upsets the VM accounting, and it makes
55	* the numbers in /proc/meminfo look odd.
56	*
57	* So here, we have a buffer which has just come off the forget list. Look to
58	* see if we can strip all buffers from the backing page.
59	*
60	* Called under lock_journal(), and possibly under journal_datalist_lock. The
61	* caller provided us with a ref against the buffer, and we drop that here.
62	*/
63	static void release_buffer_page(struct buffer_head *bh)
64	{
65	struct folio *folio;
66
67	if (buffer_dirty(bh))
68	goto nope;
69	if (atomic_read(v: &bh->b_count) != `1`)
70	goto nope;
71	folio = bh->b_folio;
72	if (folio->mapping)
73	goto nope;
74
75	/ OK, it's a truncated page /
76	if (!folio_trylock(folio))
77	goto nope;
78
79	folio_get(folio);
80	__brelse(bh);
81	try_to_free_buffers(folio);
82	folio_unlock(folio);
83	folio_put(folio);
84	return;
85
86	nope:
87	__brelse(bh);
88	}
89
90	static void jbd2_commit_block_csum_set(journal_t j, struct* buffer_head *bh)
91	{
92	struct commit_header *h;
93	__u32 csum;
94
95	if (!jbd2_journal_has_csum_v2or3(journal: j))
96	return;
97
98	h = (struct commit_header *)(bh->b_data);
99	h->h_chksum_type = `0`;
100	h->h_chksum_size = `0`;
101	h->h_chksum[`0`] = `0`;
102	csum = jbd2_chksum(journal: j, crc: j->j_csum_seed, address: bh->b_data, length: j->j_blocksize);
103	h->h_chksum[`0`] = cpu_to_be32(csum);
104	}
105
106	/*
107	* Done it all: now submit the commit record. We should have
108	* cleaned up our previous buffers by now, so if we are in abort
109	* mode we can now just skip the rest of the journal write
110	* entirely.
111	*
112	* Returns 1 if the journal needs to be aborted or 0 on success
113	*/
114	static int journal_submit_commit_record(journal_t *journal,
115	transaction_t *commit_transaction,
116	struct buffer_head **cbh,
117	__u32 crc32_sum)
118	{
119	struct commit_header *tmp;
120	struct buffer_head *bh;
121	struct timespec64 now;
122	blk_opf_t write_flags = REQ_OP_WRITE \| JBD2_JOURNAL_REQ_FLAGS;
123
124	*cbh = NULL;
125
126	if (is_journal_aborted(journal))
127	return `0`;
128
129	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
130	JBD2_COMMIT_BLOCK);
131	if (!bh)
132	return `1`;
133
134	tmp = (struct commit_header *)bh->b_data;
135	ktime_get_coarse_real_ts64(ts: &now);
136	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
137	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
138
139	if (jbd2_has_feature_checksum(j: journal)) {
140	tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
141	tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
142	tmp->h_chksum[`0`] = cpu_to_be32(crc32_sum);
143	}
144	jbd2_commit_block_csum_set(j: journal, bh);
145
146	BUFFER_TRACE(bh, "submit commit block");
147	lock_buffer(bh);
148	clear_buffer_dirty(bh);
149	set_buffer_uptodate(bh);
150	bh->b_end_io = journal_end_buffer_io_sync;
151
152	if (journal->j_flags & JBD2_BARRIER &&
153	!jbd2_has_feature_async_commit(j: journal))
154	write_flags \|= REQ_PREFLUSH \| REQ_FUA;
155
156	submit_bh(write_flags, bh);
157	*cbh = bh;
158	return `0`;
159	}
160
161	/*
162	* This function along with journal_submit_commit_record
163	* allows to write the commit record asynchronously.
164	*/
165	static int journal_wait_on_commit_record(journal_t *journal,
166	struct buffer_head *bh)
167	{
168	int ret = `0`;
169
170	clear_buffer_dirty(bh);
171	wait_on_buffer(bh);
172
173	if (unlikely(!buffer_uptodate(bh)))
174	ret = -EIO;
175	put_bh(bh); / One for getblk() /
176
177	return ret;
178	}
179
180	/ Send all the data buffers related to an inode /
181	int jbd2_submit_inode_data(journal_t journal, struct* jbd2_inode *jinode)
182	{
183	if (!jinode \|\| !(jinode->i_flags & JI_WRITE_DATA))
184	return `0`;
185
186	trace_jbd2_submit_inode_data(inode: jinode->i_vfs_inode);
187	return journal->j_submit_inode_data_buffers(jinode);
188
189	}
190	EXPORT_SYMBOL(jbd2_submit_inode_data);
191
192	int jbd2_wait_inode_data(journal_t journal, struct* jbd2_inode *jinode)
193	{
194	if (!jinode \|\| !(jinode->i_flags & JI_WAIT_DATA) \|\|
195	!jinode->i_vfs_inode \|\| !jinode->i_vfs_inode->i_mapping)
196	return `0`;
197	return filemap_fdatawait_range_keep_errors(
198	mapping: jinode->i_vfs_inode->i_mapping, start_byte: jinode->i_dirty_start,
199	end_byte: jinode->i_dirty_end);
200	}
201	EXPORT_SYMBOL(jbd2_wait_inode_data);
202
203	/*
204	* Submit all the data buffers of inode associated with the transaction to
205	* disk.
206	*
207	* We are in a committing transaction. Therefore no new inode can be added to
208	* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
209	* operate on from being released while we write out pages.
210	*/
211	static int journal_submit_data_buffers(journal_t *journal,
212	transaction_t *commit_transaction)
213	{
214	struct jbd2_inode *jinode;
215	int err, ret = `0`;
216
217	spin_lock(lock: &journal->j_list_lock);
218	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
219	if (!(jinode->i_flags & JI_WRITE_DATA))
220	continue;
221	jinode->i_flags \|= JI_COMMIT_RUNNING;
222	spin_unlock(lock: &journal->j_list_lock);
223	/ submit the inode data buffers. /
224	trace_jbd2_submit_inode_data(inode: jinode->i_vfs_inode);
225	if (journal->j_submit_inode_data_buffers) {
226	err = journal->j_submit_inode_data_buffers(jinode);
227	if (!ret)
228	ret = err;
229	}
230	spin_lock(lock: &journal->j_list_lock);
231	J_ASSERT(jinode->i_transaction == commit_transaction);
232	jinode->i_flags &= ~JI_COMMIT_RUNNING;
233	smp_mb();
234	wake_up_bit(word: &jinode->i_flags, __JI_COMMIT_RUNNING);
235	}
236	spin_unlock(lock: &journal->j_list_lock);
237	return ret;
238	}
239
240	int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
241	{
242	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
243
244	return filemap_fdatawait_range_keep_errors(mapping,
245	start_byte: jinode->i_dirty_start,
246	end_byte: jinode->i_dirty_end);
247	}
248
249	/*
250	* Wait for data submitted for writeout, refile inodes to proper
251	* transaction if needed.
252	*
253	*/
254	static int journal_finish_inode_data_buffers(journal_t *journal,
255	transaction_t *commit_transaction)
256	{
257	struct jbd2_inode jinode, next_i;
258	int err, ret = `0`;
259
260	/ For locking, see the comment in journal_submit_data_buffers() /
261	spin_lock(lock: &journal->j_list_lock);
262	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
263	if (!(jinode->i_flags & JI_WAIT_DATA))
264	continue;
265	jinode->i_flags \|= JI_COMMIT_RUNNING;
266	spin_unlock(lock: &journal->j_list_lock);
267	/ wait for the inode data buffers writeout. /
268	if (journal->j_finish_inode_data_buffers) {
269	err = journal->j_finish_inode_data_buffers(jinode);
270	if (!ret)
271	ret = err;
272	}
273	cond_resched();
274	spin_lock(lock: &journal->j_list_lock);
275	jinode->i_flags &= ~JI_COMMIT_RUNNING;
276	smp_mb();
277	wake_up_bit(word: &jinode->i_flags, __JI_COMMIT_RUNNING);
278	}
279
280	/ Now refile inode to proper lists /
281	list_for_each_entry_safe(jinode, next_i,
282	&commit_transaction->t_inode_list, i_list) {
283	list_del(entry: &jinode->i_list);
284	if (jinode->i_next_transaction) {
285	jinode->i_transaction = jinode->i_next_transaction;
286	jinode->i_next_transaction = NULL;
287	list_add(new: &jinode->i_list,
288	head: &jinode->i_transaction->t_inode_list);
289	} else {
290	jinode->i_transaction = NULL;
291	jinode->i_dirty_start = `0`;
292	jinode->i_dirty_end = `0`;
293	}
294	}
295	spin_unlock(lock: &journal->j_list_lock);
296
297	return ret;
298	}
299
300	static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
301	{
302	char *addr;
303	__u32 checksum;
304
305	addr = kmap_local_folio(folio: bh->b_folio, offset: bh_offset(bh));
306	checksum = crc32_be(crc: crc32_sum, p: addr, len: bh->b_size);
307	kunmap_local(addr);
308
309	return checksum;
310	}
311
312	static void write_tag_block(journal_t j, journal_block_tag_t tag,
313	unsigned long long block)
314	{
315	tag->t_blocknr = cpu_to_be32(block & (u32)~`0`);
316	if (jbd2_has_feature_64bit(j))
317	tag->t_blocknr_high = cpu_to_be32((block >> `31`) >> `1`);
318	}
319
320	static void jbd2_block_tag_csum_set(journal_t j, journal_block_tag_t tag,
321	struct buffer_head *bh, __u32 sequence)
322	{
323	journal_block_tag3_t tag3 = (journal_block_tag3_t )tag;
324	__u8 *addr;
325	__u32 csum32;
326	__be32 seq;
327
328	if (!jbd2_journal_has_csum_v2or3(journal: j))
329	return;
330
331	seq = cpu_to_be32(sequence);
332	addr = kmap_local_folio(folio: bh->b_folio, offset: bh_offset(bh));
333	csum32 = jbd2_chksum(journal: j, crc: j->j_csum_seed, address: (__u8 )&seq, length: sizeof*(seq));
334	csum32 = jbd2_chksum(journal: j, crc: csum32, address: addr, length: bh->b_size);
335	kunmap_local(addr);
336
337	if (jbd2_has_feature_csum3(j))
338	tag3->t_checksum = cpu_to_be32(csum32);
339	else
340	tag->t_checksum = cpu_to_be16(csum32);
341	}
342	/*
343	* jbd2_journal_commit_transaction
344	*
345	* The primary function for committing a transaction to the log. This
346	* function is called by the journal thread to begin a complete commit.
347	*/
348	void jbd2_journal_commit_transaction(journal_t *journal)
349	{
350	struct transaction_stats_s stats;
351	transaction_t *commit_transaction;
352	struct journal_head *jh;
353	struct buffer_head *descriptor;
354	struct buffer_head **wbuf = journal->j_wbuf;
355	int bufs;
356	int flags;
357	int err;
358	unsigned long long blocknr;
359	ktime_t start_time;
360	u64 commit_time;
361	char *tagp = NULL;
362	journal_block_tag_t *tag = NULL;
363	int space_left = `0`;
364	int first_tag = `0`;
365	int tag_flag;
366	int i;
367	int tag_bytes = journal_tag_bytes(journal);
368	struct buffer_head cbh = NULL; /* For transactional checksums /
369	__u32 crc32_sum = ~`0`;
370	struct blk_plug plug;
371	/ Tail of the journal /
372	unsigned long first_block;
373	tid_t first_tid;
374	int update_tail;
375	int csum_size = `0`;
376	LIST_HEAD(io_bufs);
377	LIST_HEAD(log_bufs);
378
379	if (jbd2_journal_has_csum_v2or3(journal))
380	csum_size = sizeof(struct jbd2_journal_block_tail);
381
382	/*
383	* First job: lock down the current transaction and wait for
384	* all outstanding updates to complete.
385	*/
386
387	/ Do we need to erase the effects of a prior jbd2_journal_flush? /
388	if (journal->j_flags & JBD2_FLUSHED) {
389	jbd2_debug(`3`, "super block updated\n");
390	mutex_lock_io(&journal->j_checkpoint_mutex);
391	/*
392	* We hold j_checkpoint_mutex so tail cannot change under us.
393	* We don't need any special data guarantees for writing sb
394	* since journal is empty and it is ok for write to be
395	* flushed only with transaction commit.
396	*/
397	jbd2_journal_update_sb_log_tail(journal,
398	journal->j_tail_sequence,
399	journal->j_tail, `0`);
400	mutex_unlock(lock: &journal->j_checkpoint_mutex);
401	} else {
402	jbd2_debug(`3`, "superblock not updated\n");
403	}
404
405	J_ASSERT(journal->j_running_transaction != NULL);
406	J_ASSERT(journal->j_committing_transaction == NULL);
407
408	write_lock(&journal->j_state_lock);
409	journal->j_flags \|= JBD2_FULL_COMMIT_ONGOING;
410	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
411	DEFINE_WAIT(wait);
412
413	prepare_to_wait(wq_head: &journal->j_fc_wait, wq_entry: &wait,
414	TASK_UNINTERRUPTIBLE);
415	write_unlock(&journal->j_state_lock);
416	schedule();
417	write_lock(&journal->j_state_lock);
418	finish_wait(wq_head: &journal->j_fc_wait, wq_entry: &wait);
419	/*
420	* TODO: by blocking fast commits here, we are increasing
421	* fsync() latency slightly. Strictly speaking, we don't need
422	* to block fast commits until the transaction enters T_FLUSH
423	* state. So an optimization is possible where we block new fast
424	* commits here and wait for existing ones to complete
425	* just before we enter T_FLUSH. That way, the existing fast
426	* commits and this full commit can proceed in parallel.
427	*/
428	}
429	write_unlock(&journal->j_state_lock);
430
431	commit_transaction = journal->j_running_transaction;
432
433	trace_jbd2_start_commit(journal, commit_transaction);
434	jbd2_debug(`1`, "JBD2: starting commit of transaction %d\n",
435	commit_transaction->t_tid);
436
437	write_lock(&journal->j_state_lock);
438	journal->j_fc_off = `0`;
439	J_ASSERT(commit_transaction->t_state == T_RUNNING);
440	commit_transaction->t_state = T_LOCKED;
441
442	trace_jbd2_commit_locking(journal, commit_transaction);
443	stats.run.rs_wait = commit_transaction->t_max_wait;
444	stats.run.rs_request_delay = `0`;
445	stats.run.rs_locked = jiffies;
446	if (commit_transaction->t_requested)
447	stats.run.rs_request_delay =
448	jbd2_time_diff(start: commit_transaction->t_requested,
449	end: stats.run.rs_locked);
450	stats.run.rs_running = jbd2_time_diff(start: commit_transaction->t_start,
451	end: stats.run.rs_locked);
452
453	// waits for any t_updates to finish
454	jbd2_journal_wait_updates(journal);
455
456	commit_transaction->t_state = T_SWITCH;
457
458	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
459	journal->j_max_transaction_buffers);
460
461	/*
462	* First thing we are allowed to do is to discard any remaining
463	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
464	* that there are no such buffers: if a large filesystem
465	* operation like a truncate needs to split itself over multiple
466	* transactions, then it may try to do a jbd2_journal_restart() while
467	* there are still BJ_Reserved buffers outstanding. These must
468	* be released cleanly from the current transaction.
469	*
470	* In this case, the filesystem must still reserve write access
471	* again before modifying the buffer in the new transaction, but
472	* we do not require it to remember exactly which old buffers it
473	* has reserved. This is consistent with the existing behaviour
474	* that multiple jbd2_journal_get_write_access() calls to the same
475	* buffer are perfectly permissible.
476	* We use journal->j_state_lock here to serialize processing of
477	* t_reserved_list with eviction of buffers from journal_unmap_buffer().
478	*/
479	while (commit_transaction->t_reserved_list) {
480	jh = commit_transaction->t_reserved_list;
481	JBUFFER_TRACE(jh, "reserved, unused: refile");
482	/*
483	* A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
484	* leave undo-committed data.
485	*/
486	if (jh->b_committed_data) {
487	struct buffer_head *bh = jh2bh(jh);
488
489	spin_lock(lock: &jh->b_state_lock);
490	jbd2_free(ptr: jh->b_committed_data, size: bh->b_size);
491	jh->b_committed_data = NULL;
492	spin_unlock(lock: &jh->b_state_lock);
493	}
494	jbd2_journal_refile_buffer(journal, jh);
495	}
496
497	write_unlock(&journal->j_state_lock);
498	/*
499	* Now try to drop any written-back buffers from the journal's
500	* checkpoint lists. We do this before commit because it potentially
501	* frees some memory
502	*/
503	spin_lock(lock: &journal->j_list_lock);
504	__jbd2_journal_clean_checkpoint_list(journal, destroy: false);
505	spin_unlock(lock: &journal->j_list_lock);
506
507	jbd2_debug(`3`, "JBD2: commit phase 1\n");
508
509	/*
510	* Clear revoked flag to reflect there is no revoked buffers
511	* in the next transaction which is going to be started.
512	*/
513	jbd2_clear_buffer_revoked_flags(journal);
514
515	/*
516	* Switch to a new revoke table.
517	*/
518	jbd2_journal_switch_revoke_table(journal);
519
520	write_lock(&journal->j_state_lock);
521	/*
522	* Reserved credits cannot be claimed anymore, free them
523	*/
524	atomic_sub(i: atomic_read(v: &journal->j_reserved_credits),
525	v: &commit_transaction->t_outstanding_credits);
526
527	trace_jbd2_commit_flushing(journal, commit_transaction);
528	stats.run.rs_flushing = jiffies;
529	stats.run.rs_locked = jbd2_time_diff(start: stats.run.rs_locked,
530	end: stats.run.rs_flushing);
531
532	commit_transaction->t_state = T_FLUSH;
533	journal->j_committing_transaction = commit_transaction;
534	journal->j_running_transaction = NULL;
535	start_time = ktime_get();
536	commit_transaction->t_log_start = journal->j_head;
537	wake_up_all(&journal->j_wait_transaction_locked);
538	write_unlock(&journal->j_state_lock);
539
540	jbd2_debug(`3`, "JBD2: commit phase 2a\n");
541
542	/*
543	* Now start flushing things to disk, in the order they appear
544	* on the transaction lists. Data blocks go first.
545	*/
546	err = journal_submit_data_buffers(journal, commit_transaction);
547	if (err)
548	jbd2_journal_abort(journal, err);
549
550	blk_start_plug(&plug);
551	jbd2_journal_write_revoke_records(transaction: commit_transaction, log_bufs: &log_bufs);
552
553	jbd2_debug(`3`, "JBD2: commit phase 2b\n");
554
555	/*
556	* Way to go: we have now written out all of the data for a
557	* transaction! Now comes the tricky part: we need to write out
558	* metadata. Loop over the transaction's entire buffer list:
559	*/
560	write_lock(&journal->j_state_lock);
561	commit_transaction->t_state = T_COMMIT;
562	write_unlock(&journal->j_state_lock);
563
564	trace_jbd2_commit_logging(journal, commit_transaction);
565	stats.run.rs_logging = jiffies;
566	stats.run.rs_flushing = jbd2_time_diff(start: stats.run.rs_flushing,
567	end: stats.run.rs_logging);
568	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
569	stats.run.rs_blocks_logged = `0`;
570
571	J_ASSERT(commit_transaction->t_nr_buffers <=
572	atomic_read(&commit_transaction->t_outstanding_credits));
573
574	err = `0`;
575	bufs = `0`;
576	descriptor = NULL;
577	while (commit_transaction->t_buffers) {
578
579	/ Find the next buffer to be journaled... /
580
581	jh = commit_transaction->t_buffers;
582
583	/ If we're in abort mode, we just un-journal the buffer and*
584	release it. /*
585
586	if (is_journal_aborted(journal)) {
587	clear_buffer_jbddirty(bh: jh2bh(jh));
588	JBUFFER_TRACE(jh, "journal is aborting: refile");
589	jbd2_buffer_abort_trigger(jh,
590	triggers: jh->b_frozen_data ?
591	jh->b_frozen_triggers :
592	jh->b_triggers);
593	jbd2_journal_refile_buffer(journal, jh);
594	/ If that was the last one, we need to clean up*
595	* any descriptor buffers which may have been
596	* already allocated, even if we are now
597	* aborting. */
598	if (!commit_transaction->t_buffers)
599	goto start_journal_io;
600	continue;
601	}
602
603	/ Make sure we have a descriptor block in which to*
604	record the metadata buffer. /*
605
606	if (!descriptor) {
607	J_ASSERT (bufs == `0`);
608
609	jbd2_debug(`4`, "JBD2: get descriptor\n");
610
611	descriptor = jbd2_journal_get_descriptor_buffer(
612	commit_transaction,
613	JBD2_DESCRIPTOR_BLOCK);
614	if (!descriptor) {
615	jbd2_journal_abort(journal, -EIO);
616	continue;
617	}
618
619	jbd2_debug(`4`, "JBD2: got buffer %llu (%p)\n",
620	(unsigned long long)descriptor->b_blocknr,
621	descriptor->b_data);
622	tagp = &descriptor->b_data[sizeof(journal_header_t)];
623	space_left = descriptor->b_size -
624	sizeof(journal_header_t);
625	first_tag = `1`;
626	set_buffer_jwrite(descriptor);
627	set_buffer_dirty(descriptor);
628	wbuf[bufs++] = descriptor;
629
630	/ Record it so that we can wait for IO*
631	completion later /*
632	BUFFER_TRACE(descriptor, "ph3: file as descriptor");
633	jbd2_file_log_bh(head: &log_bufs, bh: descriptor);
634	}
635
636	/ Where is the buffer to be written? /
637
638	err = jbd2_journal_next_log_block(journal, &blocknr);
639	/ If the block mapping failed, just abandon the buffer*
640	and repeat this loop: we'll fall into the
641	refile-on-abort condition above. /*
642	if (err) {
643	jbd2_journal_abort(journal, err);
644	continue;
645	}
646
647	/*
648	* start_this_handle() uses t_outstanding_credits to determine
649	* the free space in the log.
650	*/
651	atomic_dec(v: &commit_transaction->t_outstanding_credits);
652
653	/ Bump b_count to prevent truncate from stumbling over*
654	the shadowed buffer! @@@ This can go if we ever get
655	rid of the shadow pairing of buffers. /*
656	atomic_inc(v: &jh2bh(jh)->b_count);
657
658	/*
659	* Make a temporary IO buffer with which to write it out
660	* (this will requeue the metadata buffer to BJ_Shadow).
661	*/
662	set_bit(nr: BH_JWrite, addr: &jh2bh(jh)->b_state);
663	JBUFFER_TRACE(jh, "ph3: write metadata");
664	flags = jbd2_journal_write_metadata_buffer(transaction: commit_transaction,
665	jh_in: jh, bh_out: &wbuf[bufs], blocknr);
666	if (flags < `0`) {
667	jbd2_journal_abort(journal, flags);
668	continue;
669	}
670	jbd2_file_log_bh(head: &io_bufs, bh: wbuf[bufs]);
671
672	/ Record the new block's tag in the current descriptor*
673	buffer /*
674
675	tag_flag = `0`;
676	if (flags & `1`)
677	tag_flag \|= JBD2_FLAG_ESCAPE;
678	if (!first_tag)
679	tag_flag \|= JBD2_FLAG_SAME_UUID;
680
681	tag = (journal_block_tag_t *) tagp;
682	write_tag_block(j: journal, tag, block: jh2bh(jh)->b_blocknr);
683	tag->t_flags = cpu_to_be16(tag_flag);
684	jbd2_block_tag_csum_set(j: journal, tag, bh: wbuf[bufs],
685	sequence: commit_transaction->t_tid);
686	tagp += tag_bytes;
687	space_left -= tag_bytes;
688	bufs++;
689
690	if (first_tag) {
691	memcpy (tagp, journal->j_uuid, `16`);
692	tagp += `16`;
693	space_left -= `16`;
694	first_tag = `0`;
695	}
696
697	/ If there's no more to do, or if the descriptor is full,*
698	let the IO rip! /*
699
700	if (bufs == journal->j_wbufsize \|\|
701	commit_transaction->t_buffers == NULL \|\|
702	space_left < tag_bytes + `16` + csum_size) {
703
704	jbd2_debug(`4`, "JBD2: Submit %d IOs\n", bufs);
705
706	/ Write an end-of-descriptor marker before*
707	submitting the IOs. "tag" still points to
708	the last tag we set up. /*
709
710	tag->t_flags \|= cpu_to_be16(JBD2_FLAG_LAST_TAG);
711	start_journal_io:
712	if (descriptor)
713	jbd2_descriptor_block_csum_set(journal,
714	descriptor);
715
716	for (i = `0`; i < bufs; i++) {
717	struct buffer_head *bh = wbuf[i];
718
719	/*
720	* Compute checksum.
721	*/
722	if (jbd2_has_feature_checksum(j: journal)) {
723	crc32_sum =
724	jbd2_checksum_data(crc32_sum, bh);
725	}
726
727	lock_buffer(bh);
728	clear_buffer_dirty(bh);
729	set_buffer_uptodate(bh);
730	bh->b_end_io = journal_end_buffer_io_sync;
731	submit_bh(REQ_OP_WRITE \| JBD2_JOURNAL_REQ_FLAGS,
732	bh);
733	}
734	cond_resched();
735
736	/ Force a new descriptor to be generated next*
737	time round the loop. /*
738	descriptor = NULL;
739	bufs = `0`;
740	}
741	}
742
743	err = journal_finish_inode_data_buffers(journal, commit_transaction);
744	if (err) {
745	printk(KERN_WARNING
746	"JBD2: Detected IO errors while flushing file data "
747	"on %s\n", journal->j_devname);
748	if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
749	jbd2_journal_abort(journal, err);
750	err = `0`;
751	}
752
753	/*
754	* Get current oldest transaction in the log before we issue flush
755	* to the filesystem device. After the flush we can be sure that
756	* blocks of all older transactions are checkpointed to persistent
757	* storage and we will be safe to update journal start in the
758	* superblock with the numbers we get here.
759	*/
760	update_tail =
761	jbd2_journal_get_log_tail(journal, tid: &first_tid, block: &first_block);
762
763	write_lock(&journal->j_state_lock);
764	if (update_tail) {
765	long freed = first_block - journal->j_tail;
766
767	if (first_block < journal->j_tail)
768	freed += journal->j_last - journal->j_first;
769	/ Update tail only if we free significant amount of space /
770	if (freed < jbd2_journal_get_max_txn_bufs(journal))
771	update_tail = `0`;
772	}
773	J_ASSERT(commit_transaction->t_state == T_COMMIT);
774	commit_transaction->t_state = T_COMMIT_DFLUSH;
775	write_unlock(&journal->j_state_lock);
776
777	/*
778	* If the journal is not located on the file system device,
779	* then we must flush the file system device before we issue
780	* the commit record
781	*/
782	if (commit_transaction->t_need_data_flush &&
783	(journal->j_fs_dev != journal->j_dev) &&
784	(journal->j_flags & JBD2_BARRIER))
785	blkdev_issue_flush(bdev: journal->j_fs_dev);
786
787	/ Done it all: now write the commit record asynchronously. /
788	if (jbd2_has_feature_async_commit(j: journal)) {
789	err = journal_submit_commit_record(journal, commit_transaction,
790	cbh: &cbh, crc32_sum);
791	if (err)
792	jbd2_journal_abort(journal, err);
793	}
794
795	blk_finish_plug(&plug);
796
797	/ Lo and behold: we have just managed to send a transaction to*
798	the log. Before we can commit it, wait for the IO so far to
799	complete. Control buffers being written are on the
800	transaction's t_log_list queue, and metadata buffers are on
801	the io_bufs list.
802
803	Wait for the buffers in reverse order. That way we are
804	less likely to be woken up until all IOs have completed, and
805	so we incur less scheduling load.
806	*/
807
808	jbd2_debug(`3`, "JBD2: commit phase 3\n");
809
810	while (!list_empty(head: &io_bufs)) {
811	struct buffer_head *bh = list_entry(io_bufs.prev,
812	struct buffer_head,
813	b_assoc_buffers);
814
815	wait_on_buffer(bh);
816	cond_resched();
817
818	if (unlikely(!buffer_uptodate(bh)))
819	err = -EIO;
820	jbd2_unfile_log_bh(bh);
821	stats.run.rs_blocks_logged++;
822
823	/*
824	* The list contains temporary buffer heads created by
825	* jbd2_journal_write_metadata_buffer().
826	*/
827	BUFFER_TRACE(bh, "dumping temporary bh");
828	__brelse(bh);
829	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == `0`);
830	free_buffer_head(bh);
831
832	/ We also have to refile the corresponding shadowed buffer /
833	jh = commit_transaction->t_shadow_list->b_tprev;
834	bh = jh2bh(jh);
835	clear_buffer_jwrite(bh);
836	J_ASSERT_BH(bh, buffer_jbddirty(bh));
837	J_ASSERT_BH(bh, !buffer_shadow(bh));
838
839	/ The metadata is now released for reuse, but we need*
840	to remember it against this transaction so that when
841	we finally commit, we can do any checkpointing
842	required. /*
843	JBUFFER_TRACE(jh, "file as BJ_Forget");
844	jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
845	JBUFFER_TRACE(jh, "brelse shadowed buffer");
846	__brelse(bh);
847	}
848
849	J_ASSERT (commit_transaction->t_shadow_list == NULL);
850
851	jbd2_debug(`3`, "JBD2: commit phase 4\n");
852
853	/ Here we wait for the revoke record and descriptor record buffers /
854	while (!list_empty(head: &log_bufs)) {
855	struct buffer_head *bh;
856
857	bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
858	wait_on_buffer(bh);
859	cond_resched();
860
861	if (unlikely(!buffer_uptodate(bh)))
862	err = -EIO;
863
864	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
865	clear_buffer_jwrite(bh);
866	jbd2_unfile_log_bh(bh);
867	stats.run.rs_blocks_logged++;
868	__brelse(bh); / One for getblk /
869	/ AKPM: bforget here /
870	}
871
872	if (err)
873	jbd2_journal_abort(journal, err);
874
875	jbd2_debug(`3`, "JBD2: commit phase 5\n");
876	write_lock(&journal->j_state_lock);
877	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
878	commit_transaction->t_state = T_COMMIT_JFLUSH;
879	write_unlock(&journal->j_state_lock);
880
881	if (!jbd2_has_feature_async_commit(j: journal)) {
882	err = journal_submit_commit_record(journal, commit_transaction,
883	cbh: &cbh, crc32_sum);
884	if (err)
885	jbd2_journal_abort(journal, err);
886	}
887	if (cbh)
888	err = journal_wait_on_commit_record(journal, bh: cbh);
889	stats.run.rs_blocks_logged++;
890	if (jbd2_has_feature_async_commit(j: journal) &&
891	journal->j_flags & JBD2_BARRIER) {
892	blkdev_issue_flush(bdev: journal->j_dev);
893	}
894
895	if (err)
896	jbd2_journal_abort(journal, err);
897
898	WARN_ON_ONCE(
899	atomic_read(&commit_transaction->t_outstanding_credits) < `0`);
900
901	/*
902	* Now disk caches for filesystem device are flushed so we are safe to
903	* erase checkpointed transactions from the log by updating journal
904	* superblock.
905	*/
906	if (update_tail)
907	jbd2_update_log_tail(journal, tid: first_tid, block: first_block);
908
909	/ End of a transaction! Finally, we can do checkpoint*
910	processing: any buffers committed as a result of this
911	transaction can be removed from any checkpoint list it was on
912	before. /*
913
914	jbd2_debug(`3`, "JBD2: commit phase 6\n");
915
916	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
917	J_ASSERT(commit_transaction->t_buffers == NULL);
918	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
919	J_ASSERT(commit_transaction->t_shadow_list == NULL);
920
921	restart_loop:
922	/*
923	* As there are other places (journal_unmap_buffer()) adding buffers
924	* to this list we have to be careful and hold the j_list_lock.
925	*/
926	spin_lock(lock: &journal->j_list_lock);
927	while (commit_transaction->t_forget) {
928	transaction_t *cp_transaction;
929	struct buffer_head *bh;
930	int try_to_free = `0`;
931	bool drop_ref;
932
933	jh = commit_transaction->t_forget;
934	spin_unlock(lock: &journal->j_list_lock);
935	bh = jh2bh(jh);
936	/*
937	* Get a reference so that bh cannot be freed before we are
938	* done with it.
939	*/
940	get_bh(bh);
941	spin_lock(lock: &jh->b_state_lock);
942	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
943
944	/*
945	* If there is undo-protected committed data against
946	* this buffer, then we can remove it now. If it is a
947	* buffer needing such protection, the old frozen_data
948	* field now points to a committed version of the
949	* buffer, so rotate that field to the new committed
950	* data.
951	*
952	* Otherwise, we can just throw away the frozen data now.
953	*
954	* We also know that the frozen data has already fired
955	* its triggers if they exist, so we can clear that too.
956	*/
957	if (jh->b_committed_data) {
958	jbd2_free(ptr: jh->b_committed_data, size: bh->b_size);
959	jh->b_committed_data = NULL;
960	if (jh->b_frozen_data) {
961	jh->b_committed_data = jh->b_frozen_data;
962	jh->b_frozen_data = NULL;
963	jh->b_frozen_triggers = NULL;
964	}
965	} else if (jh->b_frozen_data) {
966	jbd2_free(ptr: jh->b_frozen_data, size: bh->b_size);
967	jh->b_frozen_data = NULL;
968	jh->b_frozen_triggers = NULL;
969	}
970
971	spin_lock(lock: &journal->j_list_lock);
972	cp_transaction = jh->b_cp_transaction;
973	if (cp_transaction) {
974	JBUFFER_TRACE(jh, "remove from old cp transaction");
975	cp_transaction->t_chp_stats.cs_dropped++;
976	__jbd2_journal_remove_checkpoint(jh);
977	}
978
979	/ Only re-checkpoint the buffer_head if it is marked*
980	* dirty. If the buffer was added to the BJ_Forget list
981	* by jbd2_journal_forget, it may no longer be dirty and
982	* there's no point in keeping a checkpoint record for
983	* it. */
984
985	/*
986	* A buffer which has been freed while still being journaled
987	* by a previous transaction, refile the buffer to BJ_Forget of
988	* the running transaction. If the just committed transaction
989	* contains an "add to orphan" operation, we can completely
990	* invalidate the buffer now. We are rather thorough in that
991	* since the buffer may still be accessible when blocksize <
992	* pagesize and it is attached to the last partial page.
993	*/
994	if (buffer_freed(bh) && !jh->b_next_transaction) {
995	struct address_space *mapping;
996
997	clear_buffer_freed(bh);
998	clear_buffer_jbddirty(bh);
999
1000	/*
1001	* Block device buffers need to stay mapped all the
1002	* time, so it is enough to clear buffer_jbddirty and
1003	* buffer_freed bits. For the file mapping buffers (i.e.
1004	* journalled data) we need to unmap buffer and clear
1005	* more bits. We also need to be careful about the check
1006	* because the data page mapping can get cleared under
1007	* our hands. Note that if mapping == NULL, we don't
1008	* need to make buffer unmapped because the page is
1009	* already detached from the mapping and buffers cannot
1010	* get reused.
1011	*/
1012	mapping = READ_ONCE(bh->b_folio->mapping);
1013	if (mapping && !sb_is_blkdev_sb(sb: mapping->host->i_sb)) {
1014	clear_buffer_mapped(bh);
1015	clear_buffer_new(bh);
1016	clear_buffer_req(bh);
1017	bh->b_bdev = NULL;
1018	}
1019	}
1020
1021	if (buffer_jbddirty(bh)) {
1022	JBUFFER_TRACE(jh, "add to new checkpointing trans");
1023	__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1024	if (is_journal_aborted(journal))
1025	clear_buffer_jbddirty(bh);
1026	} else {
1027	J_ASSERT_BH(bh, !buffer_dirty(bh));
1028	/*
1029	* The buffer on BJ_Forget list and not jbddirty means
1030	* it has been freed by this transaction and hence it
1031	* could not have been reallocated until this
1032	* transaction has committed. BUT it could be
1033	* reallocated once we have written all the data to
1034	* disk and before we process the buffer on BJ_Forget
1035	* list.
1036	*/
1037	if (!jh->b_next_transaction)
1038	try_to_free = `1`;
1039	}
1040	JBUFFER_TRACE(jh, "refile or unfile buffer");
1041	drop_ref = __jbd2_journal_refile_buffer(jh);
1042	spin_unlock(lock: &jh->b_state_lock);
1043	if (drop_ref)
1044	jbd2_journal_put_journal_head(jh);
1045	if (try_to_free)
1046	release_buffer_page(bh); / Drops bh reference /
1047	else
1048	__brelse(bh);
1049	cond_resched_lock(&journal->j_list_lock);
1050	}
1051	spin_unlock(lock: &journal->j_list_lock);
1052	/*
1053	* This is a bit sleazy. We use j_list_lock to protect transition
1054	* of a transaction into T_FINISHED state and calling
1055	* __jbd2_journal_drop_transaction(). Otherwise we could race with
1056	* other checkpointing code processing the transaction...
1057	*/
1058	write_lock(&journal->j_state_lock);
1059	spin_lock(lock: &journal->j_list_lock);
1060	/*
1061	* Now recheck if some buffers did not get attached to the transaction
1062	* while the lock was dropped...
1063	*/
1064	if (commit_transaction->t_forget) {
1065	spin_unlock(lock: &journal->j_list_lock);
1066	write_unlock(&journal->j_state_lock);
1067	goto restart_loop;
1068	}
1069
1070	/* Add the transaction to the checkpoint list.
1071	* __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1072	* under us because it is not marked as T_FINISHED yet. */
1073	if (journal->j_checkpoint_transactions == NULL) {
1074	journal->j_checkpoint_transactions = commit_transaction;
1075	commit_transaction->t_cpnext = commit_transaction;
1076	commit_transaction->t_cpprev = commit_transaction;
1077	} else {
1078	commit_transaction->t_cpnext =
1079	journal->j_checkpoint_transactions;
1080	commit_transaction->t_cpprev =
1081	commit_transaction->t_cpnext->t_cpprev;
1082	commit_transaction->t_cpnext->t_cpprev =
1083	commit_transaction;
1084	commit_transaction->t_cpprev->t_cpnext =
1085	commit_transaction;
1086	}
1087	spin_unlock(lock: &journal->j_list_lock);
1088
1089	/* Done with this transaction! */
1090
1091	jbd2_debug(`3`, "JBD2: commit phase 7\n");
1092
1093	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1094
1095	commit_transaction->t_start = jiffies;
1096	stats.run.rs_logging = jbd2_time_diff(start: stats.run.rs_logging,
1097	end: commit_transaction->t_start);
1098
1099	/*
1100	* File the transaction statistics
1101	*/
1102	stats.ts_tid = commit_transaction->t_tid;
1103	stats.run.rs_handle_count =
1104	atomic_read(v: &commit_transaction->t_handle_count);
1105	trace_jbd2_run_stats(dev: journal->j_fs_dev->bd_dev,
1106	tid: commit_transaction->t_tid, stats: &stats.run);
1107	stats.ts_requested = (commit_transaction->t_requested) ? `1` : `0`;
1108
1109	commit_transaction->t_state = T_COMMIT_CALLBACK;
1110	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1111	journal->j_commit_sequence = commit_transaction->t_tid;
1112	journal->j_committing_transaction = NULL;
1113	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1114
1115	/*
1116	* weight the commit time higher than the average time so we don't
1117	* react too strongly to vast changes in the commit time
1118	*/
1119	if (likely(journal->j_average_commit_time))
1120	journal->j_average_commit_time = (commit_time +
1121	journal->j_average_commit_time*`3`) / `4`;
1122	else
1123	journal->j_average_commit_time = commit_time;
1124
1125	write_unlock(&journal->j_state_lock);
1126
1127	if (journal->j_commit_callback)
1128	journal->j_commit_callback(journal, commit_transaction);
1129	if (journal->j_fc_cleanup_callback)
1130	journal->j_fc_cleanup_callback(journal, `1`, commit_transaction->t_tid);
1131
1132	trace_jbd2_end_commit(journal, commit_transaction);
1133	jbd2_debug(`1`, "JBD2: commit %d complete, head %d\n",
1134	journal->j_commit_sequence, journal->j_tail_sequence);
1135
1136	write_lock(&journal->j_state_lock);
1137	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1138	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
1139	spin_lock(lock: &journal->j_list_lock);
1140	commit_transaction->t_state = T_FINISHED;
1141	/* Check if the transaction can be dropped now that we are finished */
1142	if (commit_transaction->t_checkpoint_list == NULL) {
1143	__jbd2_journal_drop_transaction(journal, commit_transaction);
1144	jbd2_journal_free_transaction(commit_transaction);
1145	}
1146	spin_unlock(lock: &journal->j_list_lock);
1147	write_unlock(&journal->j_state_lock);
1148	wake_up(&journal->j_wait_done_commit);
1149	wake_up(&journal->j_fc_wait);
1150
1151	/*
1152	* Calculate overall stats
1153	*/
1154	spin_lock(lock: &journal->j_history_lock);
1155	journal->j_stats.ts_tid++;
1156	journal->j_stats.ts_requested += stats.ts_requested;
1157	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1158	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1159	journal->j_stats.run.rs_running += stats.run.rs_running;
1160	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1161	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1162	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1163	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1164	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1165	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1166	spin_unlock(lock: &journal->j_history_lock);
1167	}
1168

source code of linux/fs/jbd2/commit.c