transaction.c source code [linux/fs/jbd2/transaction.c]

1	// SPDX-License-Identifier: GPL-2.0+
2	/*
3	* linux/fs/jbd2/transaction.c
4	*
5	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6	*
7	* Copyright 1998 Red Hat corp --- All Rights Reserved
8	*
9	* Generic filesystem transaction handling code; part of the ext2fs
10	* journaling system.
11	*
12	* This file manages transactions (compound commits managed by the
13	* journaling code) and handles (individual atomic operations by the
14	* filesystem).
15	*/
16
17	#include <linux/time.h>
18	#include <linux/fs.h>
19	#include <linux/jbd2.h>
20	#include <linux/errno.h>
21	#include <linux/slab.h>
22	#include <linux/timer.h>
23	#include <linux/mm.h>
24	#include <linux/highmem.h>
25	#include <linux/hrtimer.h>
26	#include <linux/backing-dev.h>
27	#include <linux/bug.h>
28	#include <linux/module.h>
29	#include <linux/sched/mm.h>
30
31	#include <trace/events/jbd2.h>
32
33	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34	static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35
36	static struct kmem_cache *transaction_cache;
37	int __init jbd2_journal_init_transaction_cache(void)
38	{
39	J_ASSERT(!transaction_cache);
40	transaction_cache = kmem_cache_create(name: "jbd2_transaction_s",
41	size: sizeof(transaction_t),
42	align: `0`,
43	SLAB_HWCACHE_ALIGN\|SLAB_TEMPORARY,
44	NULL);
45	if (!transaction_cache) {
46	pr_emerg("JBD2: failed to create transaction cache\n");
47	return -ENOMEM;
48	}
49	return `0`;
50	}
51
52	void jbd2_journal_destroy_transaction_cache(void)
53	{
54	kmem_cache_destroy(s: transaction_cache);
55	transaction_cache = NULL;
56	}
57
58	void jbd2_journal_free_transaction(transaction_t *transaction)
59	{
60	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61	return;
62	kmem_cache_free(s: transaction_cache, objp: transaction);
63	}
64
65	/*
66	* Base amount of descriptor blocks we reserve for each transaction.
67	*/
68	static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
69	{
70	int tag_space = journal->j_blocksize - sizeof(journal_header_t);
71	int tags_per_block;
72
73	/ Subtract UUID /
74	tag_space -= `16`;
75	if (jbd2_journal_has_csum_v2or3(journal))
76	tag_space -= sizeof(struct jbd2_journal_block_tail);
77	/ Commit code leaves a slack space of 16 bytes at the end of block /
78	tags_per_block = (tag_space - `16`) / journal_tag_bytes(journal);
79	/*
80	* Revoke descriptors are accounted separately so we need to reserve
81	* space for commit block and normal transaction descriptor blocks.
82	*/
83	return `1` + DIV_ROUND_UP(journal->j_max_transaction_buffers,
84	tags_per_block);
85	}
86
87	/*
88	* jbd2_get_transaction: obtain a new transaction_t object.
89	*
90	* Simply initialise a new transaction. Initialize it in
91	* RUNNING state and add it to the current journal (which should not
92	* have an existing running transaction: we only make a new transaction
93	* once we have started to commit the old one).
94	*
95	* Preconditions:
96	* The journal MUST be locked. We don't perform atomic mallocs on the
97	* new transaction and we can't block without protecting against other
98	* processes trying to touch the journal while it is in transition.
99	*
100	*/
101
102	static void jbd2_get_transaction(journal_t *journal,
103	transaction_t *transaction)
104	{
105	transaction->t_journal = journal;
106	transaction->t_state = T_RUNNING;
107	transaction->t_start_time = ktime_get();
108	transaction->t_tid = journal->j_transaction_sequence++;
109	transaction->t_expires = jiffies + journal->j_commit_interval;
110	atomic_set(v: &transaction->t_updates, i: `0`);
111	atomic_set(v: &transaction->t_outstanding_credits,
112	i: jbd2_descriptor_blocks_per_trans(journal) +
113	atomic_read(v: &journal->j_reserved_credits));
114	atomic_set(v: &transaction->t_outstanding_revokes, i: `0`);
115	atomic_set(v: &transaction->t_handle_count, i: `0`);
116	INIT_LIST_HEAD(list: &transaction->t_inode_list);
117	INIT_LIST_HEAD(list: &transaction->t_private_list);
118
119	/ Set up the commit timer for the new transaction. /
120	journal->j_commit_timer.expires = round_jiffies_up(j: transaction->t_expires);
121	add_timer(timer: &journal->j_commit_timer);
122
123	J_ASSERT(journal->j_running_transaction == NULL);
124	journal->j_running_transaction = transaction;
125	transaction->t_max_wait = `0`;
126	transaction->t_start = jiffies;
127	transaction->t_requested = `0`;
128	}
129
130	/*
131	* Handle management.
132	*
133	* A handle_t is an object which represents a single atomic update to a
134	* filesystem, and which tracks all of the modifications which form part
135	* of that one update.
136	*/
137
138	/*
139	* Update transaction's maximum wait time, if debugging is enabled.
140	*
141	* t_max_wait is carefully updated here with use of atomic compare exchange.
142	* Note that there could be multiplre threads trying to do this simultaneously
143	* hence using cmpxchg to avoid any use of locks in this case.
144	* With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
145	*/
146	static inline void update_t_max_wait(transaction_t *transaction,
147	unsigned long ts)
148	{
149	unsigned long oldts, newts;
150
151	if (time_after(transaction->t_start, ts)) {
152	newts = jbd2_time_diff(start: ts, end: transaction->t_start);
153	oldts = READ_ONCE(transaction->t_max_wait);
154	while (oldts < newts)
155	oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
156	}
157	}
158
159	/*
160	* Wait until running transaction passes to T_FLUSH state and new transaction
161	* can thus be started. Also starts the commit if needed. The function expects
162	* running transaction to exist and releases j_state_lock.
163	*/
164	static void wait_transaction_locked(journal_t *journal)
165	__releases(journal->j_state_lock)
166	{
167	DEFINE_WAIT(wait);
168	int need_to_start;
169	tid_t tid = journal->j_running_transaction->t_tid;
170
171	prepare_to_wait_exclusive(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait,
172	TASK_UNINTERRUPTIBLE);
173	need_to_start = !tid_geq(x: journal->j_commit_request, y: tid);
174	read_unlock(&journal->j_state_lock);
175	if (need_to_start)
176	jbd2_log_start_commit(journal, tid);
177	jbd2_might_wait_for_commit(journal);
178	schedule();
179	finish_wait(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait);
180	}
181
182	/*
183	* Wait until running transaction transitions from T_SWITCH to T_FLUSH
184	* state and new transaction can thus be started. The function releases
185	* j_state_lock.
186	*/
187	static void wait_transaction_switching(journal_t *journal)
188	__releases(journal->j_state_lock)
189	{
190	DEFINE_WAIT(wait);
191
192	if (WARN_ON(!journal->j_running_transaction \|\|
193	journal->j_running_transaction->t_state != T_SWITCH)) {
194	read_unlock(&journal->j_state_lock);
195	return;
196	}
197	prepare_to_wait_exclusive(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait,
198	TASK_UNINTERRUPTIBLE);
199	read_unlock(&journal->j_state_lock);
200	/*
201	* We don't call jbd2_might_wait_for_commit() here as there's no
202	* waiting for outstanding handles happening anymore in T_SWITCH state
203	* and handling of reserved handles actually relies on that for
204	* correctness.
205	*/
206	schedule();
207	finish_wait(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait);
208	}
209
210	static void sub_reserved_credits(journal_t journal, int* blocks)
211	{
212	atomic_sub(i: blocks, v: &journal->j_reserved_credits);
213	wake_up(&journal->j_wait_reserved);
214	}
215
216	/*
217	* Wait until we can add credits for handle to the running transaction. Called
218	* with j_state_lock held for reading. Returns 0 if handle joined the running
219	* transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
220	* caller must retry.
221	*
222	* Note: because j_state_lock may be dropped depending on the return
223	* value, we need to fake out sparse so ti doesn't complain about a
224	* locking imbalance. Callers of add_transaction_credits will need to
225	* make a similar accomodation.
226	*/
227	static int add_transaction_credits(journal_t journal, int* blocks,
228	int rsv_blocks)
229	__must_hold(&journal->j_state_lock)
230	{
231	transaction_t *t = journal->j_running_transaction;
232	int needed;
233	int total = blocks + rsv_blocks;
234
235	/*
236	* If the current transaction is locked down for commit, wait
237	* for the lock to be released.
238	*/
239	if (t->t_state != T_RUNNING) {
240	WARN_ON_ONCE(t->t_state >= T_FLUSH);
241	wait_transaction_locked(journal);
242	__acquire(&journal->j_state_lock); / fake out sparse /
243	return `1`;
244	}
245
246	/*
247	* If there is not enough space left in the log to write all
248	* potential buffers requested by this operation, we need to
249	* stall pending a log checkpoint to free some more log space.
250	*/
251	needed = atomic_add_return(i: total, v: &t->t_outstanding_credits);
252	if (needed > journal->j_max_transaction_buffers) {
253	/*
254	* If the current transaction is already too large,
255	* then start to commit it: we can then go back and
256	* attach this handle to a new transaction.
257	*/
258	atomic_sub(i: total, v: &t->t_outstanding_credits);
259
260	/*
261	* Is the number of reserved credits in the current transaction too
262	* big to fit this handle? Wait until reserved credits are freed.
263	*/
264	if (atomic_read(v: &journal->j_reserved_credits) + total >
265	journal->j_max_transaction_buffers) {
266	read_unlock(&journal->j_state_lock);
267	jbd2_might_wait_for_commit(journal);
268	wait_event(journal->j_wait_reserved,
269	atomic_read(&journal->j_reserved_credits) + total <=
270	journal->j_max_transaction_buffers);
271	__acquire(&journal->j_state_lock); / fake out sparse /
272	return `1`;
273	}
274
275	wait_transaction_locked(journal);
276	__acquire(&journal->j_state_lock); / fake out sparse /
277	return `1`;
278	}
279
280	/*
281	* The commit code assumes that it can get enough log space
282	* without forcing a checkpoint. This is critical for
283	* correctness: a checkpoint of a buffer which is also
284	* associated with a committing transaction creates a deadlock,
285	* so commit simply cannot force through checkpoints.
286	*
287	* We must therefore ensure the necessary space in the journal
288	* before starting to dirty potentially checkpointed buffers
289	* in the new transaction.
290	*/
291	if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
292	atomic_sub(i: total, v: &t->t_outstanding_credits);
293	read_unlock(&journal->j_state_lock);
294	jbd2_might_wait_for_commit(journal);
295	write_lock(&journal->j_state_lock);
296	if (jbd2_log_space_left(journal) <
297	journal->j_max_transaction_buffers)
298	__jbd2_log_wait_for_space(journal);
299	write_unlock(&journal->j_state_lock);
300	__acquire(&journal->j_state_lock); / fake out sparse /
301	return `1`;
302	}
303
304	/ No reservation? We are done... /
305	if (!rsv_blocks)
306	return `0`;
307
308	needed = atomic_add_return(i: rsv_blocks, v: &journal->j_reserved_credits);
309	/ We allow at most half of a transaction to be reserved /
310	if (needed > journal->j_max_transaction_buffers / `2`) {
311	sub_reserved_credits(journal, blocks: rsv_blocks);
312	atomic_sub(i: total, v: &t->t_outstanding_credits);
313	read_unlock(&journal->j_state_lock);
314	jbd2_might_wait_for_commit(journal);
315	wait_event(journal->j_wait_reserved,
316	atomic_read(&journal->j_reserved_credits) + rsv_blocks
317	<= journal->j_max_transaction_buffers / `2`);
318	__acquire(&journal->j_state_lock); / fake out sparse /
319	return `1`;
320	}
321	return `0`;
322	}
323
324	/*
325	* start_this_handle: Given a handle, deal with any locking or stalling
326	* needed to make sure that there is enough journal space for the handle
327	* to begin. Attach the handle to a transaction and set up the
328	* transaction's buffer credits.
329	*/
330
331	static int start_this_handle(journal_t journal, handle_t handle,
332	gfp_t gfp_mask)
333	{
334	transaction_t transaction, new_transaction = NULL;
335	int blocks = handle->h_total_credits;
336	int rsv_blocks = `0`;
337	unsigned long ts = jiffies;
338
339	if (handle->h_rsv_handle)
340	rsv_blocks = handle->h_rsv_handle->h_total_credits;
341
342	/*
343	* Limit the number of reserved credits to 1/2 of maximum transaction
344	* size and limit the number of total credits to not exceed maximum
345	* transaction size per operation.
346	*/
347	if ((rsv_blocks > journal->j_max_transaction_buffers / `2`) \|\|
348	(rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
349	printk(KERN_ERR "JBD2: %s wants too many credits "
350	"credits:%d rsv_credits:%d max:%d\n",
351	current->comm, blocks, rsv_blocks,
352	journal->j_max_transaction_buffers);
353	WARN_ON(`1`);
354	return -ENOSPC;
355	}
356
357	alloc_transaction:
358	/*
359	* This check is racy but it is just an optimization of allocating new
360	* transaction early if there are high chances we'll need it. If we
361	* guess wrong, we'll retry or free unused transaction.
362	*/
363	if (!data_race(journal->j_running_transaction)) {
364	/*
365	* If __GFP_FS is not present, then we may be being called from
366	* inside the fs writeback layer, so we MUST NOT fail.
367	*/
368	if ((gfp_mask & __GFP_FS) == `0`)
369	gfp_mask \|= __GFP_NOFAIL;
370	new_transaction = kmem_cache_zalloc(k: transaction_cache,
371	flags: gfp_mask);
372	if (!new_transaction)
373	return -ENOMEM;
374	}
375
376	jbd2_debug(`3`, "New handle %p going live.\n", handle);
377
378	/*
379	* We need to hold j_state_lock until t_updates has been incremented,
380	* for proper journal barrier handling
381	*/
382	repeat:
383	read_lock(&journal->j_state_lock);
384	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
385	if (is_journal_aborted(journal) \|\|
386	(journal->j_errno != `0` && !(journal->j_flags & JBD2_ACK_ERR))) {
387	read_unlock(&journal->j_state_lock);
388	jbd2_journal_free_transaction(transaction: new_transaction);
389	return -EROFS;
390	}
391
392	/*
393	* Wait on the journal's transaction barrier if necessary. Specifically
394	* we allow reserved handles to proceed because otherwise commit could
395	* deadlock on page writeback not being able to complete.
396	*/
397	if (!handle->h_reserved && journal->j_barrier_count) {
398	read_unlock(&journal->j_state_lock);
399	wait_event(journal->j_wait_transaction_locked,
400	journal->j_barrier_count == `0`);
401	goto repeat;
402	}
403
404	if (!journal->j_running_transaction) {
405	read_unlock(&journal->j_state_lock);
406	if (!new_transaction)
407	goto alloc_transaction;
408	write_lock(&journal->j_state_lock);
409	if (!journal->j_running_transaction &&
410	(handle->h_reserved \|\| !journal->j_barrier_count)) {
411	jbd2_get_transaction(journal, transaction: new_transaction);
412	new_transaction = NULL;
413	}
414	write_unlock(&journal->j_state_lock);
415	goto repeat;
416	}
417
418	transaction = journal->j_running_transaction;
419
420	if (!handle->h_reserved) {
421	/ We may have dropped j_state_lock - restart in that case /
422	if (add_transaction_credits(journal, blocks, rsv_blocks)) {
423	/*
424	* add_transaction_credits releases
425	* j_state_lock on a non-zero return
426	*/
427	__release(&journal->j_state_lock);
428	goto repeat;
429	}
430	} else {
431	/*
432	* We have handle reserved so we are allowed to join T_LOCKED
433	* transaction and we don't have to check for transaction size
434	* and journal space. But we still have to wait while running
435	* transaction is being switched to a committing one as it
436	* won't wait for any handles anymore.
437	*/
438	if (transaction->t_state == T_SWITCH) {
439	wait_transaction_switching(journal);
440	goto repeat;
441	}
442	sub_reserved_credits(journal, blocks);
443	handle->h_reserved = `0`;
444	}
445
446	/ OK, account for the buffers that this operation expects to*
447	* use and add the handle to the running transaction.
448	*/
449	update_t_max_wait(transaction, ts);
450	handle->h_transaction = transaction;
451	handle->h_requested_credits = blocks;
452	handle->h_revoke_credits_requested = handle->h_revoke_credits;
453	handle->h_start_jiffies = jiffies;
454	atomic_inc(v: &transaction->t_updates);
455	atomic_inc(v: &transaction->t_handle_count);
456	jbd2_debug(`4`, "Handle %p given %d credits (total %d, free %lu)\n",
457	handle, blocks,
458	atomic_read(&transaction->t_outstanding_credits),
459	jbd2_log_space_left(journal));
460	read_unlock(&journal->j_state_lock);
461	current->journal_info = handle;
462
463	rwsem_acquire_read(&journal->j_trans_commit_map, `0`, `0`, _THIS_IP_);
464	jbd2_journal_free_transaction(transaction: new_transaction);
465	/*
466	* Ensure that no allocations done while the transaction is open are
467	* going to recurse back to the fs layer.
468	*/
469	handle->saved_alloc_context = memalloc_nofs_save();
470	return `0`;
471	}
472
473	/ Allocate a new handle. This should probably be in a slab... /
474	static handle_t new_handle(int* nblocks)
475	{
476	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
477	if (!handle)
478	return NULL;
479	handle->h_total_credits = nblocks;
480	handle->h_ref = `1`;
481
482	return handle;
483	}
484
485	handle_t jbd2__journal_start(journal_t journal, int nblocks, int rsv_blocks,
486	int revoke_records, gfp_t gfp_mask,
487	unsigned int type, unsigned int line_no)
488	{
489	handle_t *handle = journal_current_handle();
490	int err;
491
492	if (!journal)
493	return ERR_PTR(error: -EROFS);
494
495	if (handle) {
496	J_ASSERT(handle->h_transaction->t_journal == journal);
497	handle->h_ref++;
498	return handle;
499	}
500
501	nblocks += DIV_ROUND_UP(revoke_records,
502	journal->j_revoke_records_per_block);
503	handle = new_handle(nblocks);
504	if (!handle)
505	return ERR_PTR(error: -ENOMEM);
506	if (rsv_blocks) {
507	handle_t *rsv_handle;
508
509	rsv_handle = new_handle(nblocks: rsv_blocks);
510	if (!rsv_handle) {
511	jbd2_free_handle(handle);
512	return ERR_PTR(error: -ENOMEM);
513	}
514	rsv_handle->h_reserved = `1`;
515	rsv_handle->h_journal = journal;
516	handle->h_rsv_handle = rsv_handle;
517	}
518	handle->h_revoke_credits = revoke_records;
519
520	err = start_this_handle(journal, handle, gfp_mask);
521	if (err < `0`) {
522	if (handle->h_rsv_handle)
523	jbd2_free_handle(handle: handle->h_rsv_handle);
524	jbd2_free_handle(handle);
525	return ERR_PTR(error: err);
526	}
527	handle->h_type = type;
528	handle->h_line_no = line_no;
529	trace_jbd2_handle_start(dev: journal->j_fs_dev->bd_dev,
530	tid: handle->h_transaction->t_tid, type,
531	line_no, requested_blocks: nblocks);
532
533	return handle;
534	}
535	EXPORT_SYMBOL(jbd2__journal_start);
536
537
538	/**
539	* jbd2_journal_start() - Obtain a new handle.
540	* @journal: Journal to start transaction on.
541	* @nblocks: number of block buffer we might modify
542	*
543	* We make sure that the transaction can guarantee at least nblocks of
544	* modified buffers in the log. We block until the log can guarantee
545	* that much space. Additionally, if rsv_blocks > 0, we also create another
546	* handle with rsv_blocks reserved blocks in the journal. This handle is
547	* stored in h_rsv_handle. It is not attached to any particular transaction
548	* and thus doesn't block transaction commit. If the caller uses this reserved
549	* handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
550	* on the parent handle will dispose the reserved one. Reserved handle has to
551	* be converted to a normal handle using jbd2_journal_start_reserved() before
552	* it can be used.
553	*
554	* Return a pointer to a newly allocated handle, or an ERR_PTR() value
555	* on failure.
556	*/
557	handle_t jbd2_journal_start(journal_t journal, int nblocks)
558	{
559	return jbd2__journal_start(journal, nblocks, `0`, `0`, GFP_NOFS, `0`, `0`);
560	}
561	EXPORT_SYMBOL(jbd2_journal_start);
562
563	static void __jbd2_journal_unreserve_handle(handle_t handle, transaction_t t)
564	{
565	journal_t *journal = handle->h_journal;
566
567	WARN_ON(!handle->h_reserved);
568	sub_reserved_credits(journal, blocks: handle->h_total_credits);
569	if (t)
570	atomic_sub(i: handle->h_total_credits, v: &t->t_outstanding_credits);
571	}
572
573	void jbd2_journal_free_reserved(handle_t *handle)
574	{
575	journal_t *journal = handle->h_journal;
576
577	/ Get j_state_lock to pin running transaction if it exists /
578	read_lock(&journal->j_state_lock);
579	__jbd2_journal_unreserve_handle(handle, t: journal->j_running_transaction);
580	read_unlock(&journal->j_state_lock);
581	jbd2_free_handle(handle);
582	}
583	EXPORT_SYMBOL(jbd2_journal_free_reserved);
584
585	/**
586	* jbd2_journal_start_reserved() - start reserved handle
587	* @handle: handle to start
588	* @type: for handle statistics
589	* @line_no: for handle statistics
590	*
591	* Start handle that has been previously reserved with jbd2_journal_reserve().
592	* This attaches @handle to the running transaction (or creates one if there's
593	* not transaction running). Unlike jbd2_journal_start() this function cannot
594	* block on journal commit, checkpointing, or similar stuff. It can block on
595	* memory allocation or frozen journal though.
596	*
597	* Return 0 on success, non-zero on error - handle is freed in that case.
598	*/
599	int jbd2_journal_start_reserved(handle_t handle, unsigned* int type,
600	unsigned int line_no)
601	{
602	journal_t *journal = handle->h_journal;
603	int ret = -EIO;
604
605	if (WARN_ON(!handle->h_reserved)) {
606	/ Someone passed in normal handle? Just stop it. /
607	jbd2_journal_stop(handle);
608	return ret;
609	}
610	/*
611	* Usefulness of mixing of reserved and unreserved handles is
612	* questionable. So far nobody seems to need it so just error out.
613	*/
614	if (WARN_ON(current->journal_info)) {
615	jbd2_journal_free_reserved(handle);
616	return ret;
617	}
618
619	handle->h_journal = NULL;
620	/*
621	* GFP_NOFS is here because callers are likely from writeback or
622	* similarly constrained call sites
623	*/
624	ret = start_this_handle(journal, handle, GFP_NOFS);
625	if (ret < `0`) {
626	handle->h_journal = journal;
627	jbd2_journal_free_reserved(handle);
628	return ret;
629	}
630	handle->h_type = type;
631	handle->h_line_no = line_no;
632	trace_jbd2_handle_start(dev: journal->j_fs_dev->bd_dev,
633	tid: handle->h_transaction->t_tid, type,
634	line_no, requested_blocks: handle->h_total_credits);
635	return `0`;
636	}
637	EXPORT_SYMBOL(jbd2_journal_start_reserved);
638
639	/**
640	* jbd2_journal_extend() - extend buffer credits.
641	* @handle: handle to 'extend'
642	* @nblocks: nr blocks to try to extend by.
643	* @revoke_records: number of revoke records to try to extend by.
644	*
645	* Some transactions, such as large extends and truncates, can be done
646	* atomically all at once or in several stages. The operation requests
647	* a credit for a number of buffer modifications in advance, but can
648	* extend its credit if it needs more.
649	*
650	* jbd2_journal_extend tries to give the running handle more buffer credits.
651	* It does not guarantee that allocation - this is a best-effort only.
652	* The calling process MUST be able to deal cleanly with a failure to
653	* extend here.
654	*
655	* Return 0 on success, non-zero on failure.
656	*
657	* return code < 0 implies an error
658	* return code > 0 implies normal transaction-full status.
659	*/
660	int jbd2_journal_extend(handle_t handle, int* nblocks, int revoke_records)
661	{
662	transaction_t *transaction = handle->h_transaction;
663	journal_t *journal;
664	int result;
665	int wanted;
666
667	if (is_handle_aborted(handle))
668	return -EROFS;
669	journal = transaction->t_journal;
670
671	result = `1`;
672
673	read_lock(&journal->j_state_lock);
674
675	/ Don't extend a locked-down transaction! /
676	if (transaction->t_state != T_RUNNING) {
677	jbd2_debug(`3`, "denied handle %p %d blocks: "
678	"transaction not running\n", handle, nblocks);
679	goto error_out;
680	}
681
682	nblocks += DIV_ROUND_UP(
683	handle->h_revoke_credits_requested + revoke_records,
684	journal->j_revoke_records_per_block) -
685	DIV_ROUND_UP(
686	handle->h_revoke_credits_requested,
687	journal->j_revoke_records_per_block);
688	wanted = atomic_add_return(i: nblocks,
689	v: &transaction->t_outstanding_credits);
690
691	if (wanted > journal->j_max_transaction_buffers) {
692	jbd2_debug(`3`, "denied handle %p %d blocks: "
693	"transaction too large\n", handle, nblocks);
694	atomic_sub(i: nblocks, v: &transaction->t_outstanding_credits);
695	goto error_out;
696	}
697
698	trace_jbd2_handle_extend(dev: journal->j_fs_dev->bd_dev,
699	tid: transaction->t_tid,
700	type: handle->h_type, line_no: handle->h_line_no,
701	buffer_credits: handle->h_total_credits,
702	requested_blocks: nblocks);
703
704	handle->h_total_credits += nblocks;
705	handle->h_requested_credits += nblocks;
706	handle->h_revoke_credits += revoke_records;
707	handle->h_revoke_credits_requested += revoke_records;
708	result = `0`;
709
710	jbd2_debug(`3`, "extended handle %p by %d\n", handle, nblocks);
711	error_out:
712	read_unlock(&journal->j_state_lock);
713	return result;
714	}
715
716	static void stop_this_handle(handle_t *handle)
717	{
718	transaction_t *transaction = handle->h_transaction;
719	journal_t *journal = transaction->t_journal;
720	int revokes;
721
722	J_ASSERT(journal_current_handle() == handle);
723	J_ASSERT(atomic_read(&transaction->t_updates) > `0`);
724	current->journal_info = NULL;
725	/*
726	* Subtract necessary revoke descriptor blocks from handle credits. We
727	* take care to account only for revoke descriptor blocks the
728	* transaction will really need as large sequences of transactions with
729	* small numbers of revokes are relatively common.
730	*/
731	revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
732	if (revokes) {
733	int t_revokes, revoke_descriptors;
734	int rr_per_blk = journal->j_revoke_records_per_block;
735
736	WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
737	> handle->h_total_credits);
738	t_revokes = atomic_add_return(i: revokes,
739	v: &transaction->t_outstanding_revokes);
740	revoke_descriptors =
741	DIV_ROUND_UP(t_revokes, rr_per_blk) -
742	DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
743	handle->h_total_credits -= revoke_descriptors;
744	}
745	atomic_sub(i: handle->h_total_credits,
746	v: &transaction->t_outstanding_credits);
747	if (handle->h_rsv_handle)
748	__jbd2_journal_unreserve_handle(handle: handle->h_rsv_handle,
749	t: transaction);
750	if (atomic_dec_and_test(v: &transaction->t_updates))
751	wake_up(&journal->j_wait_updates);
752
753	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
754	/*
755	* Scope of the GFP_NOFS context is over here and so we can restore the
756	* original alloc context.
757	*/
758	memalloc_nofs_restore(flags: handle->saved_alloc_context);
759	}
760
761	/**
762	* jbd2__journal_restart() - restart a handle .
763	* @handle: handle to restart
764	* @nblocks: nr credits requested
765	* @revoke_records: number of revoke record credits requested
766	* @gfp_mask: memory allocation flags (for start_this_handle)
767	*
768	* Restart a handle for a multi-transaction filesystem
769	* operation.
770	*
771	* If the jbd2_journal_extend() call above fails to grant new buffer credits
772	* to a running handle, a call to jbd2_journal_restart will commit the
773	* handle's transaction so far and reattach the handle to a new
774	* transaction capable of guaranteeing the requested number of
775	* credits. We preserve reserved handle if there's any attached to the
776	* passed in handle.
777	*/
778	int jbd2__journal_restart(handle_t handle, int* nblocks, int revoke_records,
779	gfp_t gfp_mask)
780	{
781	transaction_t *transaction = handle->h_transaction;
782	journal_t *journal;
783	tid_t tid;
784	int need_to_start;
785	int ret;
786
787	/ If we've had an abort of any type, don't even think about*
788	* actually doing the restart! */
789	if (is_handle_aborted(handle))
790	return `0`;
791	journal = transaction->t_journal;
792	tid = transaction->t_tid;
793
794	/*
795	* First unlink the handle from its current transaction, and start the
796	* commit on that.
797	*/
798	jbd2_debug(`2`, "restarting handle %p\n", handle);
799	stop_this_handle(handle);
800	handle->h_transaction = NULL;
801
802	/*
803	* TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
804	* get rid of pointless j_state_lock traffic like this.
805	*/
806	read_lock(&journal->j_state_lock);
807	need_to_start = !tid_geq(x: journal->j_commit_request, y: tid);
808	read_unlock(&journal->j_state_lock);
809	if (need_to_start)
810	jbd2_log_start_commit(journal, tid);
811	handle->h_total_credits = nblocks +
812	DIV_ROUND_UP(revoke_records,
813	journal->j_revoke_records_per_block);
814	handle->h_revoke_credits = revoke_records;
815	ret = start_this_handle(journal, handle, gfp_mask);
816	trace_jbd2_handle_restart(dev: journal->j_fs_dev->bd_dev,
817	tid: ret ? `0` : handle->h_transaction->t_tid,
818	type: handle->h_type, line_no: handle->h_line_no,
819	requested_blocks: handle->h_total_credits);
820	return ret;
821	}
822	EXPORT_SYMBOL(jbd2__journal_restart);
823
824
825	int jbd2_journal_restart(handle_t handle, int* nblocks)
826	{
827	return jbd2__journal_restart(handle, nblocks, `0`, GFP_NOFS);
828	}
829	EXPORT_SYMBOL(jbd2_journal_restart);
830
831	/*
832	* Waits for any outstanding t_updates to finish.
833	* This is called with write j_state_lock held.
834	*/
835	void jbd2_journal_wait_updates(journal_t *journal)
836	{
837	DEFINE_WAIT(wait);
838
839	while (`1`) {
840	/*
841	* Note that the running transaction can get freed under us if
842	* this transaction is getting committed in
843	* jbd2_journal_commit_transaction() ->
844	* jbd2_journal_free_transaction(). This can only happen when we
845	* release j_state_lock -> schedule() -> acquire j_state_lock.
846	* Hence we should everytime retrieve new j_running_transaction
847	* value (after j_state_lock release acquire cycle), else it may
848	* lead to use-after-free of old freed transaction.
849	*/
850	transaction_t *transaction = journal->j_running_transaction;
851
852	if (!transaction)
853	break;
854
855	prepare_to_wait(wq_head: &journal->j_wait_updates, wq_entry: &wait,
856	TASK_UNINTERRUPTIBLE);
857	if (!atomic_read(v: &transaction->t_updates)) {
858	finish_wait(wq_head: &journal->j_wait_updates, wq_entry: &wait);
859	break;
860	}
861	write_unlock(&journal->j_state_lock);
862	schedule();
863	finish_wait(wq_head: &journal->j_wait_updates, wq_entry: &wait);
864	write_lock(&journal->j_state_lock);
865	}
866	}
867
868	/**
869	* jbd2_journal_lock_updates () - establish a transaction barrier.
870	* @journal: Journal to establish a barrier on.
871	*
872	* This locks out any further updates from being started, and blocks
873	* until all existing updates have completed, returning only once the
874	* journal is in a quiescent state with no updates running.
875	*
876	* The journal lock should not be held on entry.
877	*/
878	void jbd2_journal_lock_updates(journal_t *journal)
879	{
880	jbd2_might_wait_for_commit(journal);
881
882	write_lock(&journal->j_state_lock);
883	++journal->j_barrier_count;
884
885	/ Wait until there are no reserved handles /
886	if (atomic_read(v: &journal->j_reserved_credits)) {
887	write_unlock(&journal->j_state_lock);
888	wait_event(journal->j_wait_reserved,
889	atomic_read(&journal->j_reserved_credits) == `0`);
890	write_lock(&journal->j_state_lock);
891	}
892
893	/ Wait until there are no running t_updates /
894	jbd2_journal_wait_updates(journal);
895
896	write_unlock(&journal->j_state_lock);
897
898	/*
899	* We have now established a barrier against other normal updates, but
900	* we also need to barrier against other jbd2_journal_lock_updates() calls
901	* to make sure that we serialise special journal-locked operations
902	* too.
903	*/
904	mutex_lock(&journal->j_barrier);
905	}
906
907	/**
908	* jbd2_journal_unlock_updates () - release barrier
909	* @journal: Journal to release the barrier on.
910	*
911	* Release a transaction barrier obtained with jbd2_journal_lock_updates().
912	*
913	* Should be called without the journal lock held.
914	*/
915	void jbd2_journal_unlock_updates (journal_t *journal)
916	{
917	J_ASSERT(journal->j_barrier_count != `0`);
918
919	mutex_unlock(lock: &journal->j_barrier);
920	write_lock(&journal->j_state_lock);
921	--journal->j_barrier_count;
922	write_unlock(&journal->j_state_lock);
923	wake_up_all(&journal->j_wait_transaction_locked);
924	}
925
926	static void warn_dirty_buffer(struct buffer_head *bh)
927	{
928	printk(KERN_WARNING
929	"JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
930	"There's a risk of filesystem corruption in case of system "
931	"crash.\n",
932	bh->b_bdev, (unsigned long long)bh->b_blocknr);
933	}
934
935	/ Call t_frozen trigger and copy buffer data into jh->b_frozen_data. /
936	static void jbd2_freeze_jh_data(struct journal_head *jh)
937	{
938	char *source;
939	struct buffer_head *bh = jh2bh(jh);
940
941	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
942	source = kmap_local_folio(folio: bh->b_folio, offset: bh_offset(bh));
943	/ Fire data frozen trigger just before we copy the data /
944	jbd2_buffer_frozen_trigger(jh, mapped_data: source, triggers: jh->b_triggers);
945	memcpy(jh->b_frozen_data, source, bh->b_size);
946	kunmap_local(source);
947
948	/*
949	* Now that the frozen data is saved off, we need to store any matching
950	* triggers.
951	*/
952	jh->b_frozen_triggers = jh->b_triggers;
953	}
954
955	/*
956	* If the buffer is already part of the current transaction, then there
957	* is nothing we need to do. If it is already part of a prior
958	* transaction which we are still committing to disk, then we need to
959	* make sure that we do not overwrite the old copy: we do copy-out to
960	* preserve the copy going to disk. We also account the buffer against
961	* the handle's metadata buffer credits (unless the buffer is already
962	* part of the transaction, that is).
963	*
964	*/
965	static int
966	do_get_write_access(handle_t handle, struct* journal_head *jh,
967	int force_copy)
968	{
969	struct buffer_head *bh;
970	transaction_t *transaction = handle->h_transaction;
971	journal_t *journal;
972	int error;
973	char *frozen_buffer = NULL;
974	unsigned long start_lock, time_lock;
975
976	journal = transaction->t_journal;
977
978	jbd2_debug(`5`, "journal_head %p, force_copy %d\n", jh, force_copy);
979
980	JBUFFER_TRACE(jh, "entry");
981	repeat:
982	bh = jh2bh(jh);
983
984	/ @@@ Need to check for errors here at some point. /
985
986	start_lock = jiffies;
987	lock_buffer(bh);
988	spin_lock(lock: &jh->b_state_lock);
989
990	/ If it takes too long to lock the buffer, trace it /
991	time_lock = jbd2_time_diff(start: start_lock, end: jiffies);
992	if (time_lock > HZ/`10`)
993	trace_jbd2_lock_buffer_stall(dev: bh->b_bdev->bd_dev,
994	stall_ms: jiffies_to_msecs(j: time_lock));
995
996	/ We now hold the buffer lock so it is safe to query the buffer*
997	* state. Is the buffer dirty?
998	*
999	* If so, there are two possibilities. The buffer may be
1000	* non-journaled, and undergoing a quite legitimate writeback.
1001	* Otherwise, it is journaled, and we don't expect dirty buffers
1002	* in that state (the buffers should be marked JBD_Dirty
1003	* instead.) So either the IO is being done under our own
1004	* control and this is a bug, or it's a third party IO such as
1005	* dump(8) (which may leave the buffer scheduled for read ---
1006	* ie. locked but not dirty) or tune2fs (which may actually have
1007	* the buffer dirtied, ugh.) */
1008
1009	if (buffer_dirty(bh) && jh->b_transaction) {
1010	warn_dirty_buffer(bh);
1011	/*
1012	* We need to clean the dirty flag and we must do it under the
1013	* buffer lock to be sure we don't race with running write-out.
1014	*/
1015	JBUFFER_TRACE(jh, "Journalling dirty buffer");
1016	clear_buffer_dirty(bh);
1017	/*
1018	* The buffer is going to be added to BJ_Reserved list now and
1019	* nothing guarantees jbd2_journal_dirty_metadata() will be
1020	* ever called for it. So we need to set jbddirty bit here to
1021	* make sure the buffer is dirtied and written out when the
1022	* journaling machinery is done with it.
1023	*/
1024	set_buffer_jbddirty(bh);
1025	}
1026
1027	error = -EROFS;
1028	if (is_handle_aborted(handle)) {
1029	spin_unlock(lock: &jh->b_state_lock);
1030	unlock_buffer(bh);
1031	goto out;
1032	}
1033	error = `0`;
1034
1035	/*
1036	* The buffer is already part of this transaction if b_transaction or
1037	* b_next_transaction points to it
1038	*/
1039	if (jh->b_transaction == transaction \|\|
1040	jh->b_next_transaction == transaction) {
1041	unlock_buffer(bh);
1042	goto done;
1043	}
1044
1045	/*
1046	* this is the first time this transaction is touching this buffer,
1047	* reset the modified flag
1048	*/
1049	jh->b_modified = `0`;
1050
1051	/*
1052	* If the buffer is not journaled right now, we need to make sure it
1053	* doesn't get written to disk before the caller actually commits the
1054	* new data
1055	*/
1056	if (!jh->b_transaction) {
1057	JBUFFER_TRACE(jh, "no transaction");
1058	J_ASSERT_JH(jh, !jh->b_next_transaction);
1059	JBUFFER_TRACE(jh, "file as BJ_Reserved");
1060	/*
1061	* Make sure all stores to jh (b_modified, b_frozen_data) are
1062	* visible before attaching it to the running transaction.
1063	* Paired with barrier in jbd2_write_access_granted()
1064	*/
1065	smp_wmb();
1066	spin_lock(lock: &journal->j_list_lock);
1067	if (test_clear_buffer_dirty(bh)) {
1068	/*
1069	* Execute buffer dirty clearing and jh->b_transaction
1070	* assignment under journal->j_list_lock locked to
1071	* prevent bh being removed from checkpoint list if
1072	* the buffer is in an intermediate state (not dirty
1073	* and jh->b_transaction is NULL).
1074	*/
1075	JBUFFER_TRACE(jh, "Journalling dirty buffer");
1076	set_buffer_jbddirty(bh);
1077	}
1078	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1079	spin_unlock(lock: &journal->j_list_lock);
1080	unlock_buffer(bh);
1081	goto done;
1082	}
1083	unlock_buffer(bh);
1084
1085	/*
1086	* If there is already a copy-out version of this buffer, then we don't
1087	* need to make another one
1088	*/
1089	if (jh->b_frozen_data) {
1090	JBUFFER_TRACE(jh, "has frozen data");
1091	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1092	goto attach_next;
1093	}
1094
1095	JBUFFER_TRACE(jh, "owned by older transaction");
1096	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1097	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
1098
1099	/*
1100	* There is one case we have to be very careful about. If the
1101	* committing transaction is currently writing this buffer out to disk
1102	* and has NOT made a copy-out, then we cannot modify the buffer
1103	* contents at all right now. The essence of copy-out is that it is
1104	* the extra copy, not the primary copy, which gets journaled. If the
1105	* primary copy is already going to disk then we cannot do copy-out
1106	* here.
1107	*/
1108	if (buffer_shadow(bh)) {
1109	JBUFFER_TRACE(jh, "on shadow: sleep");
1110	spin_unlock(lock: &jh->b_state_lock);
1111	wait_on_bit_io(word: &bh->b_state, bit: BH_Shadow, TASK_UNINTERRUPTIBLE);
1112	goto repeat;
1113	}
1114
1115	/*
1116	* Only do the copy if the currently-owning transaction still needs it.
1117	* If buffer isn't on BJ_Metadata list, the committing transaction is
1118	* past that stage (here we use the fact that BH_Shadow is set under
1119	* bh_state lock together with refiling to BJ_Shadow list and at this
1120	* point we know the buffer doesn't have BH_Shadow set).
1121	*
1122	* Subtle point, though: if this is a get_undo_access, then we will be
1123	* relying on the frozen_data to contain the new value of the
1124	* committed_data record after the transaction, so we HAVE to force the
1125	* frozen_data copy in that case.
1126	*/
1127	if (jh->b_jlist == BJ_Metadata \|\| force_copy) {
1128	JBUFFER_TRACE(jh, "generate frozen data");
1129	if (!frozen_buffer) {
1130	JBUFFER_TRACE(jh, "allocate memory for buffer");
1131	spin_unlock(lock: &jh->b_state_lock);
1132	frozen_buffer = jbd2_alloc(size: jh2bh(jh)->b_size,
1133	GFP_NOFS \| __GFP_NOFAIL);
1134	goto repeat;
1135	}
1136	jh->b_frozen_data = frozen_buffer;
1137	frozen_buffer = NULL;
1138	jbd2_freeze_jh_data(jh);
1139	}
1140	attach_next:
1141	/*
1142	* Make sure all stores to jh (b_modified, b_frozen_data) are visible
1143	* before attaching it to the running transaction. Paired with barrier
1144	* in jbd2_write_access_granted()
1145	*/
1146	smp_wmb();
1147	jh->b_next_transaction = transaction;
1148
1149	done:
1150	spin_unlock(lock: &jh->b_state_lock);
1151
1152	/*
1153	* If we are about to journal a buffer, then any revoke pending on it is
1154	* no longer valid
1155	*/
1156	jbd2_journal_cancel_revoke(handle, jh);
1157
1158	out:
1159	if (unlikely(frozen_buffer)) / It's usually NULL /
1160	jbd2_free(ptr: frozen_buffer, size: bh->b_size);
1161
1162	JBUFFER_TRACE(jh, "exit");
1163	return error;
1164	}
1165
1166	/ Fast check whether buffer is already attached to the required transaction /
1167	static bool jbd2_write_access_granted(handle_t handle, struct* buffer_head *bh,
1168	bool undo)
1169	{
1170	struct journal_head *jh;
1171	bool ret = false;
1172
1173	/ Dirty buffers require special handling... /
1174	if (buffer_dirty(bh))
1175	return false;
1176
1177	/*
1178	* RCU protects us from dereferencing freed pages. So the checks we do
1179	* are guaranteed not to oops. However the jh slab object can get freed
1180	* & reallocated while we work with it. So we have to be careful. When
1181	* we see jh attached to the running transaction, we know it must stay
1182	* so until the transaction is committed. Thus jh won't be freed and
1183	* will be attached to the same bh while we run. However it can
1184	* happen jh gets freed, reallocated, and attached to the transaction
1185	* just after we get pointer to it from bh. So we have to be careful
1186	* and recheck jh still belongs to our bh before we return success.
1187	*/
1188	rcu_read_lock();
1189	if (!buffer_jbd(bh))
1190	goto out;
1191	/ This should be bh2jh() but that doesn't work with inline functions /
1192	jh = READ_ONCE(bh->b_private);
1193	if (!jh)
1194	goto out;
1195	/ For undo access buffer must have data copied /
1196	if (undo && !jh->b_committed_data)
1197	goto out;
1198	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
1199	READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
1200	goto out;
1201	/*
1202	* There are two reasons for the barrier here:
1203	* 1) Make sure to fetch b_bh after we did previous checks so that we
1204	* detect when jh went through free, realloc, attach to transaction
1205	* while we were checking. Paired with implicit barrier in that path.
1206	* 2) So that access to bh done after jbd2_write_access_granted()
1207	* doesn't get reordered and see inconsistent state of concurrent
1208	* do_get_write_access().
1209	*/
1210	smp_mb();
1211	if (unlikely(jh->b_bh != bh))
1212	goto out;
1213	ret = true;
1214	out:
1215	rcu_read_unlock();
1216	return ret;
1217	}
1218
1219	/**
1220	* jbd2_journal_get_write_access() - notify intent to modify a buffer
1221	* for metadata (not data) update.
1222	* @handle: transaction to add buffer modifications to
1223	* @bh: bh to be used for metadata writes
1224	*
1225	* Returns: error code or 0 on success.
1226	*
1227	* In full data journalling mode the buffer may be of type BJ_AsyncData,
1228	* because we're ``write()ing`` a buffer which is also part of a shared mapping.
1229	*/
1230
1231	int jbd2_journal_get_write_access(handle_t handle, struct* buffer_head *bh)
1232	{
1233	struct journal_head *jh;
1234	journal_t *journal;
1235	int rc;
1236
1237	if (is_handle_aborted(handle))
1238	return -EROFS;
1239
1240	journal = handle->h_transaction->t_journal;
1241	if (jbd2_check_fs_dev_write_error(journal)) {
1242	/*
1243	* If the fs dev has writeback errors, it may have failed
1244	* to async write out metadata buffers in the background.
1245	* In this case, we could read old data from disk and write
1246	* it out again, which may lead to on-disk filesystem
1247	* inconsistency. Aborting journal can avoid it happen.
1248	*/
1249	jbd2_journal_abort(journal, -EIO);
1250	return -EIO;
1251	}
1252
1253	if (jbd2_write_access_granted(handle, bh, undo: false))
1254	return `0`;
1255
1256	jh = jbd2_journal_add_journal_head(bh);
1257	/ We do not want to get caught playing with fields which the*
1258	* log thread also manipulates. Make sure that the buffer
1259	* completes any outstanding IO before proceeding. */
1260	rc = do_get_write_access(handle, jh, force_copy: `0`);
1261	jbd2_journal_put_journal_head(jh);
1262	return rc;
1263	}
1264
1265
1266	/*
1267	* When the user wants to journal a newly created buffer_head
1268	* (ie. getblk() returned a new buffer and we are going to populate it
1269	* manually rather than reading off disk), then we need to keep the
1270	* buffer_head locked until it has been completely filled with new
1271	* data. In this case, we should be able to make the assertion that
1272	* the bh is not already part of an existing transaction.
1273	*
1274	* The buffer should already be locked by the caller by this point.
1275	* There is no lock ranking violation: it was a newly created,
1276	* unlocked buffer beforehand. */
1277
1278	/**
1279	* jbd2_journal_get_create_access () - notify intent to use newly created bh
1280	* @handle: transaction to new buffer to
1281	* @bh: new buffer.
1282	*
1283	* Call this if you create a new bh.
1284	*/
1285	int jbd2_journal_get_create_access(handle_t handle, struct* buffer_head *bh)
1286	{
1287	transaction_t *transaction = handle->h_transaction;
1288	journal_t *journal;
1289	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1290	int err;
1291
1292	jbd2_debug(`5`, "journal_head %p\n", jh);
1293	err = -EROFS;
1294	if (is_handle_aborted(handle))
1295	goto out;
1296	journal = transaction->t_journal;
1297	err = `0`;
1298
1299	JBUFFER_TRACE(jh, "entry");
1300	/*
1301	* The buffer may already belong to this transaction due to pre-zeroing
1302	* in the filesystem's new_block code. It may also be on the previous,
1303	* committing transaction's lists, but it HAS to be in Forget state in
1304	* that case: the transaction must have deleted the buffer for it to be
1305	* reused here.
1306	*/
1307	spin_lock(lock: &jh->b_state_lock);
1308	J_ASSERT_JH(jh, (jh->b_transaction == transaction \|\|
1309	jh->b_transaction == NULL \|\|
1310	(jh->b_transaction == journal->j_committing_transaction &&
1311	jh->b_jlist == BJ_Forget)));
1312
1313	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1314	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
1315
1316	if (jh->b_transaction == NULL) {
1317	/*
1318	* Previous jbd2_journal_forget() could have left the buffer
1319	* with jbddirty bit set because it was being committed. When
1320	* the commit finished, we've filed the buffer for
1321	* checkpointing and marked it dirty. Now we are reallocating
1322	* the buffer so the transaction freeing it must have
1323	* committed and so it's safe to clear the dirty bit.
1324	*/
1325	clear_buffer_dirty(bh: jh2bh(jh));
1326	/ first access by this transaction /
1327	jh->b_modified = `0`;
1328
1329	JBUFFER_TRACE(jh, "file as BJ_Reserved");
1330	spin_lock(lock: &journal->j_list_lock);
1331	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1332	spin_unlock(lock: &journal->j_list_lock);
1333	} else if (jh->b_transaction == journal->j_committing_transaction) {
1334	/ first access by this transaction /
1335	jh->b_modified = `0`;
1336
1337	JBUFFER_TRACE(jh, "set next transaction");
1338	spin_lock(lock: &journal->j_list_lock);
1339	jh->b_next_transaction = transaction;
1340	spin_unlock(lock: &journal->j_list_lock);
1341	}
1342	spin_unlock(lock: &jh->b_state_lock);
1343
1344	/*
1345	* akpm: I added this. ext3_alloc_branch can pick up new indirect
1346	* blocks which contain freed but then revoked metadata. We need
1347	* to cancel the revoke in case we end up freeing it yet again
1348	* and the reallocating as data - this would cause a second revoke,
1349	* which hits an assertion error.
1350	*/
1351	JBUFFER_TRACE(jh, "cancelling revoke");
1352	jbd2_journal_cancel_revoke(handle, jh);
1353	out:
1354	jbd2_journal_put_journal_head(jh);
1355	return err;
1356	}
1357
1358	/**
1359	* jbd2_journal_get_undo_access() - Notify intent to modify metadata with
1360	* non-rewindable consequences
1361	* @handle: transaction
1362	* @bh: buffer to undo
1363	*
1364	* Sometimes there is a need to distinguish between metadata which has
1365	* been committed to disk and that which has not. The ext3fs code uses
1366	* this for freeing and allocating space, we have to make sure that we
1367	* do not reuse freed space until the deallocation has been committed,
1368	* since if we overwrote that space we would make the delete
1369	* un-rewindable in case of a crash.
1370	*
1371	* To deal with that, jbd2_journal_get_undo_access requests write access to a
1372	* buffer for parts of non-rewindable operations such as delete
1373	* operations on the bitmaps. The journaling code must keep a copy of
1374	* the buffer's contents prior to the undo_access call until such time
1375	* as we know that the buffer has definitely been committed to disk.
1376	*
1377	* We never need to know which transaction the committed data is part
1378	* of, buffers touched here are guaranteed to be dirtied later and so
1379	* will be committed to a new transaction in due course, at which point
1380	* we can discard the old committed data pointer.
1381	*
1382	* Returns error number or 0 on success.
1383	*/
1384	int jbd2_journal_get_undo_access(handle_t handle, struct* buffer_head *bh)
1385	{
1386	int err;
1387	struct journal_head *jh;
1388	char *committed_data = NULL;
1389
1390	if (is_handle_aborted(handle))
1391	return -EROFS;
1392
1393	if (jbd2_write_access_granted(handle, bh, undo: true))
1394	return `0`;
1395
1396	jh = jbd2_journal_add_journal_head(bh);
1397	JBUFFER_TRACE(jh, "entry");
1398
1399	/*
1400	* Do this first --- it can drop the journal lock, so we want to
1401	* make sure that obtaining the committed_data is done
1402	* atomically wrt. completion of any outstanding commits.
1403	*/
1404	err = do_get_write_access(handle, jh, force_copy: `1`);
1405	if (err)
1406	goto out;
1407
1408	repeat:
1409	if (!jh->b_committed_data)
1410	committed_data = jbd2_alloc(size: jh2bh(jh)->b_size,
1411	GFP_NOFS\|__GFP_NOFAIL);
1412
1413	spin_lock(lock: &jh->b_state_lock);
1414	if (!jh->b_committed_data) {
1415	/ Copy out the current buffer contents into the*
1416	* preserved, committed copy. */
1417	JBUFFER_TRACE(jh, "generate b_committed data");
1418	if (!committed_data) {
1419	spin_unlock(lock: &jh->b_state_lock);
1420	goto repeat;
1421	}
1422
1423	jh->b_committed_data = committed_data;
1424	committed_data = NULL;
1425	memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
1426	}
1427	spin_unlock(lock: &jh->b_state_lock);
1428	out:
1429	jbd2_journal_put_journal_head(jh);
1430	if (unlikely(committed_data))
1431	jbd2_free(ptr: committed_data, size: bh->b_size);
1432	return err;
1433	}
1434
1435	/**
1436	* jbd2_journal_set_triggers() - Add triggers for commit writeout
1437	* @bh: buffer to trigger on
1438	* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
1439	*
1440	* Set any triggers on this journal_head. This is always safe, because
1441	* triggers for a committing buffer will be saved off, and triggers for
1442	* a running transaction will match the buffer in that transaction.
1443	*
1444	* Call with NULL to clear the triggers.
1445	*/
1446	void jbd2_journal_set_triggers(struct buffer_head *bh,
1447	struct jbd2_buffer_trigger_type *type)
1448	{
1449	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
1450
1451	if (WARN_ON_ONCE(!jh))
1452	return;
1453	jh->b_triggers = type;
1454	jbd2_journal_put_journal_head(jh);
1455	}
1456
1457	void jbd2_buffer_frozen_trigger(struct journal_head jh, void* *mapped_data,
1458	struct jbd2_buffer_trigger_type *triggers)
1459	{
1460	struct buffer_head *bh = jh2bh(jh);
1461
1462	if (!triggers \|\| !triggers->t_frozen)
1463	return;
1464
1465	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
1466	}
1467
1468	void jbd2_buffer_abort_trigger(struct journal_head *jh,
1469	struct jbd2_buffer_trigger_type *triggers)
1470	{
1471	if (!triggers \|\| !triggers->t_abort)
1472	return;
1473
1474	triggers->t_abort(triggers, jh2bh(jh));
1475	}
1476
1477	/**
1478	* jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1479	* @handle: transaction to add buffer to.
1480	* @bh: buffer to mark
1481	*
1482	* mark dirty metadata which needs to be journaled as part of the current
1483	* transaction.
1484	*
1485	* The buffer must have previously had jbd2_journal_get_write_access()
1486	* called so that it has a valid journal_head attached to the buffer
1487	* head.
1488	*
1489	* The buffer is placed on the transaction's metadata list and is marked
1490	* as belonging to the transaction.
1491	*
1492	* Returns error number or 0 on success.
1493	*
1494	* Special care needs to be taken if the buffer already belongs to the
1495	* current committing transaction (in which case we should have frozen
1496	* data present for that commit). In that case, we don't relink the
1497	* buffer: that only gets done when the old transaction finally
1498	* completes its commit.
1499	*/
1500	int jbd2_journal_dirty_metadata(handle_t handle, struct* buffer_head *bh)
1501	{
1502	transaction_t *transaction = handle->h_transaction;
1503	journal_t *journal;
1504	struct journal_head *jh;
1505	int ret = `0`;
1506
1507	if (!buffer_jbd(bh))
1508	return -EUCLEAN;
1509
1510	/*
1511	* We don't grab jh reference here since the buffer must be part
1512	* of the running transaction.
1513	*/
1514	jh = bh2jh(bh);
1515	jbd2_debug(`5`, "journal_head %p\n", jh);
1516	JBUFFER_TRACE(jh, "entry");
1517
1518	/*
1519	* This and the following assertions are unreliable since we may see jh
1520	* in inconsistent state unless we grab bh_state lock. But this is
1521	* crucial to catch bugs so let's do a reliable check until the
1522	* lockless handling is fully proven.
1523	*/
1524	if (data_race(jh->b_transaction != transaction &&
1525	jh->b_next_transaction != transaction)) {
1526	spin_lock(lock: &jh->b_state_lock);
1527	J_ASSERT_JH(jh, jh->b_transaction == transaction \|\|
1528	jh->b_next_transaction == transaction);
1529	spin_unlock(lock: &jh->b_state_lock);
1530	}
1531	if (jh->b_modified == `1`) {
1532	/ If it's in our transaction it must be in BJ_Metadata list. /
1533	if (data_race(jh->b_transaction == transaction &&
1534	jh->b_jlist != BJ_Metadata)) {
1535	spin_lock(lock: &jh->b_state_lock);
1536	if (jh->b_transaction == transaction &&
1537	jh->b_jlist != BJ_Metadata)
1538	pr_err("JBD2: assertion failure: h_type=%u "
1539	"h_line_no=%u block_no=%llu jlist=%u\n",
1540	handle->h_type, handle->h_line_no,
1541	(unsigned long long) bh->b_blocknr,
1542	jh->b_jlist);
1543	J_ASSERT_JH(jh, jh->b_transaction != transaction \|\|
1544	jh->b_jlist == BJ_Metadata);
1545	spin_unlock(lock: &jh->b_state_lock);
1546	}
1547	goto out;
1548	}
1549
1550	journal = transaction->t_journal;
1551	spin_lock(lock: &jh->b_state_lock);
1552
1553	if (is_handle_aborted(handle)) {
1554	/*
1555	* Check journal aborting with @jh->b_state_lock locked,
1556	* since 'jh->b_transaction' could be replaced with
1557	* 'jh->b_next_transaction' during old transaction
1558	* committing if journal aborted, which may fail
1559	* assertion on 'jh->b_frozen_data == NULL'.
1560	*/
1561	ret = -EROFS;
1562	goto out_unlock_bh;
1563	}
1564
1565	if (jh->b_modified == `0`) {
1566	/*
1567	* This buffer's got modified and becoming part
1568	* of the transaction. This needs to be done
1569	* once a transaction -bzzz
1570	*/
1571	if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= `0`)) {
1572	ret = -ENOSPC;
1573	goto out_unlock_bh;
1574	}
1575	jh->b_modified = `1`;
1576	handle->h_total_credits--;
1577	}
1578
1579	/*
1580	* fastpath, to avoid expensive locking. If this buffer is already
1581	* on the running transaction's metadata list there is nothing to do.
1582	* Nobody can take it off again because there is a handle open.
1583	* I _think_ we're OK here with SMP barriers - a mistaken decision will
1584	* result in this test being false, so we go in and take the locks.
1585	*/
1586	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1587	JBUFFER_TRACE(jh, "fastpath");
1588	if (unlikely(jh->b_transaction !=
1589	journal->j_running_transaction)) {
1590	printk(KERN_ERR "JBD2: %s: "
1591	"jh->b_transaction (%llu, %p, %u) != "
1592	"journal->j_running_transaction (%p, %u)\n",
1593	journal->j_devname,
1594	(unsigned long long) bh->b_blocknr,
1595	jh->b_transaction,
1596	jh->b_transaction ? jh->b_transaction->t_tid : `0`,
1597	journal->j_running_transaction,
1598	journal->j_running_transaction ?
1599	journal->j_running_transaction->t_tid : `0`);
1600	ret = -EINVAL;
1601	}
1602	goto out_unlock_bh;
1603	}
1604
1605	set_buffer_jbddirty(bh);
1606
1607	/*
1608	* Metadata already on the current transaction list doesn't
1609	* need to be filed. Metadata on another transaction's list must
1610	* be committing, and will be refiled once the commit completes:
1611	* leave it alone for now.
1612	*/
1613	if (jh->b_transaction != transaction) {
1614	JBUFFER_TRACE(jh, "already on other transaction");
1615	if (unlikely(((jh->b_transaction !=
1616	journal->j_committing_transaction)) \|\|
1617	(jh->b_next_transaction != transaction))) {
1618	printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1619	"bad jh for block %llu: "
1620	"transaction (%p, %u), "
1621	"jh->b_transaction (%p, %u), "
1622	"jh->b_next_transaction (%p, %u), jlist %u\n",
1623	journal->j_devname,
1624	(unsigned long long) bh->b_blocknr,
1625	transaction, transaction->t_tid,
1626	jh->b_transaction,
1627	jh->b_transaction ?
1628	jh->b_transaction->t_tid : `0`,
1629	jh->b_next_transaction,
1630	jh->b_next_transaction ?
1631	jh->b_next_transaction->t_tid : `0`,
1632	jh->b_jlist);
1633	WARN_ON(`1`);
1634	ret = -EINVAL;
1635	}
1636	/ And this case is illegal: we can't reuse another*
1637	* transaction's data buffer, ever. */
1638	goto out_unlock_bh;
1639	}
1640
1641	/ That test should have eliminated the following case: /
1642	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1643
1644	JBUFFER_TRACE(jh, "file as BJ_Metadata");
1645	spin_lock(lock: &journal->j_list_lock);
1646	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1647	spin_unlock(lock: &journal->j_list_lock);
1648	out_unlock_bh:
1649	spin_unlock(lock: &jh->b_state_lock);
1650	out:
1651	JBUFFER_TRACE(jh, "exit");
1652	return ret;
1653	}
1654
1655	/**
1656	* jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1657	* @handle: transaction handle
1658	* @bh: bh to 'forget'
1659	*
1660	* We can only do the bforget if there are no commits pending against the
1661	* buffer. If the buffer is dirty in the current running transaction we
1662	* can safely unlink it.
1663	*
1664	* bh may not be a journalled buffer at all - it may be a non-JBD
1665	* buffer which came off the hashtable. Check for this.
1666	*
1667	* Decrements bh->b_count by one.
1668	*
1669	* Allow this call even if the handle has aborted --- it may be part of
1670	* the caller's cleanup after an abort.
1671	*/
1672	int jbd2_journal_forget(handle_t handle, struct* buffer_head *bh)
1673	{
1674	transaction_t *transaction = handle->h_transaction;
1675	journal_t *journal;
1676	struct journal_head *jh;
1677	int drop_reserve = `0`;
1678	int err = `0`;
1679	int was_modified = `0`;
1680
1681	if (is_handle_aborted(handle))
1682	return -EROFS;
1683	journal = transaction->t_journal;
1684
1685	BUFFER_TRACE(bh, "entry");
1686
1687	jh = jbd2_journal_grab_journal_head(bh);
1688	if (!jh) {
1689	__bforget(bh);
1690	return `0`;
1691	}
1692
1693	spin_lock(lock: &jh->b_state_lock);
1694
1695	/ Critical error: attempting to delete a bitmap buffer, maybe?*
1696	* Don't do any jbd operations, and return an error. */
1697	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1698	"inconsistent data on disk")) {
1699	err = -EIO;
1700	goto drop;
1701	}
1702
1703	/ keep track of whether or not this transaction modified us /
1704	was_modified = jh->b_modified;
1705
1706	/*
1707	* The buffer's going from the transaction, we must drop
1708	* all references -bzzz
1709	*/
1710	jh->b_modified = `0`;
1711
1712	if (jh->b_transaction == transaction) {
1713	J_ASSERT_JH(jh, !jh->b_frozen_data);
1714
1715	/ If we are forgetting a buffer which is already part*
1716	* of this transaction, then we can just drop it from
1717	* the transaction immediately. */
1718	clear_buffer_dirty(bh);
1719	clear_buffer_jbddirty(bh);
1720
1721	JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1722
1723	/*
1724	* we only want to drop a reference if this transaction
1725	* modified the buffer
1726	*/
1727	if (was_modified)
1728	drop_reserve = `1`;
1729
1730	/*
1731	* We are no longer going to journal this buffer.
1732	* However, the commit of this transaction is still
1733	* important to the buffer: the delete that we are now
1734	* processing might obsolete an old log entry, so by
1735	* committing, we can satisfy the buffer's checkpoint.
1736	*
1737	* So, if we have a checkpoint on the buffer, we should
1738	* now refile the buffer on our BJ_Forget list so that
1739	* we know to remove the checkpoint after we commit.
1740	*/
1741
1742	spin_lock(lock: &journal->j_list_lock);
1743	if (jh->b_cp_transaction) {
1744	__jbd2_journal_temp_unlink_buffer(jh);
1745	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1746	} else {
1747	__jbd2_journal_unfile_buffer(jh);
1748	jbd2_journal_put_journal_head(jh);
1749	}
1750	spin_unlock(lock: &journal->j_list_lock);
1751	} else if (jh->b_transaction) {
1752	J_ASSERT_JH(jh, (jh->b_transaction ==
1753	journal->j_committing_transaction));
1754	/ However, if the buffer is still owned by a prior*
1755	* (committing) transaction, we can't drop it yet... */
1756	JBUFFER_TRACE(jh, "belongs to older transaction");
1757	/ ... but we CAN drop it from the new transaction through*
1758	* marking the buffer as freed and set j_next_transaction to
1759	* the new transaction, so that not only the commit code
1760	* knows it should clear dirty bits when it is done with the
1761	* buffer, but also the buffer can be checkpointed only
1762	* after the new transaction commits. */
1763
1764	set_buffer_freed(bh);
1765
1766	if (!jh->b_next_transaction) {
1767	spin_lock(lock: &journal->j_list_lock);
1768	jh->b_next_transaction = transaction;
1769	spin_unlock(lock: &journal->j_list_lock);
1770	} else {
1771	J_ASSERT(jh->b_next_transaction == transaction);
1772
1773	/*
1774	* only drop a reference if this transaction modified
1775	* the buffer
1776	*/
1777	if (was_modified)
1778	drop_reserve = `1`;
1779	}
1780	} else {
1781	/*
1782	* Finally, if the buffer is not belongs to any
1783	* transaction, we can just drop it now if it has no
1784	* checkpoint.
1785	*/
1786	spin_lock(lock: &journal->j_list_lock);
1787	if (!jh->b_cp_transaction) {
1788	JBUFFER_TRACE(jh, "belongs to none transaction");
1789	spin_unlock(lock: &journal->j_list_lock);
1790	goto drop;
1791	}
1792
1793	/*
1794	* Otherwise, if the buffer has been written to disk,
1795	* it is safe to remove the checkpoint and drop it.
1796	*/
1797	if (jbd2_journal_try_remove_checkpoint(jh) >= `0`) {
1798	spin_unlock(lock: &journal->j_list_lock);
1799	goto drop;
1800	}
1801
1802	/*
1803	* The buffer is still not written to disk, we should
1804	* attach this buffer to current transaction so that the
1805	* buffer can be checkpointed only after the current
1806	* transaction commits.
1807	*/
1808	clear_buffer_dirty(bh);
1809	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1810	spin_unlock(lock: &journal->j_list_lock);
1811	}
1812	drop:
1813	__brelse(bh);
1814	spin_unlock(lock: &jh->b_state_lock);
1815	jbd2_journal_put_journal_head(jh);
1816	if (drop_reserve) {
1817	/ no need to reserve log space for this block -bzzz /
1818	handle->h_total_credits++;
1819	}
1820	return err;
1821	}
1822
1823	/**
1824	* jbd2_journal_stop() - complete a transaction
1825	* @handle: transaction to complete.
1826	*
1827	* All done for a particular handle.
1828	*
1829	* There is not much action needed here. We just return any remaining
1830	* buffer credits to the transaction and remove the handle. The only
1831	* complication is that we need to start a commit operation if the
1832	* filesystem is marked for synchronous update.
1833	*
1834	* jbd2_journal_stop itself will not usually return an error, but it may
1835	* do so in unusual circumstances. In particular, expect it to
1836	* return -EIO if a jbd2_journal_abort has been executed since the
1837	* transaction began.
1838	*/
1839	int jbd2_journal_stop(handle_t *handle)
1840	{
1841	transaction_t *transaction = handle->h_transaction;
1842	journal_t *journal;
1843	int err = `0`, wait_for_commit = `0`;
1844	tid_t tid;
1845	pid_t pid;
1846
1847	if (--handle->h_ref > `0`) {
1848	jbd2_debug(`4`, "h_ref %d -> %d\n", handle->h_ref + `1`,
1849	handle->h_ref);
1850	if (is_handle_aborted(handle))
1851	return -EIO;
1852	return `0`;
1853	}
1854	if (!transaction) {
1855	/*
1856	* Handle is already detached from the transaction so there is
1857	* nothing to do other than free the handle.
1858	*/
1859	memalloc_nofs_restore(flags: handle->saved_alloc_context);
1860	goto free_and_exit;
1861	}
1862	journal = transaction->t_journal;
1863	tid = transaction->t_tid;
1864
1865	if (is_handle_aborted(handle))
1866	err = -EIO;
1867
1868	jbd2_debug(`4`, "Handle %p going down\n", handle);
1869	trace_jbd2_handle_stats(dev: journal->j_fs_dev->bd_dev,
1870	tid, type: handle->h_type, line_no: handle->h_line_no,
1871	interval: jiffies - handle->h_start_jiffies,
1872	sync: handle->h_sync, requested_blocks: handle->h_requested_credits,
1873	dirtied_blocks: (handle->h_requested_credits -
1874	handle->h_total_credits));
1875
1876	/*
1877	* Implement synchronous transaction batching. If the handle
1878	* was synchronous, don't force a commit immediately. Let's
1879	* yield and let another thread piggyback onto this
1880	* transaction. Keep doing that while new threads continue to
1881	* arrive. It doesn't cost much - we're about to run a commit
1882	* and sleep on IO anyway. Speeds up many-threaded, many-dir
1883	* operations by 30x or more...
1884	*
1885	* We try and optimize the sleep time against what the
1886	* underlying disk can do, instead of having a static sleep
1887	* time. This is useful for the case where our storage is so
1888	* fast that it is more optimal to go ahead and force a flush
1889	* and wait for the transaction to be committed than it is to
1890	* wait for an arbitrary amount of time for new writers to
1891	* join the transaction. We achieve this by measuring how
1892	* long it takes to commit a transaction, and compare it with
1893	* how long this transaction has been running, and if run time
1894	* < commit time then we sleep for the delta and commit. This
1895	* greatly helps super fast disks that would see slowdowns as
1896	* more threads started doing fsyncs.
1897	*
1898	* But don't do this if this process was the most recent one
1899	* to perform a synchronous write. We do this to detect the
1900	* case where a single process is doing a stream of sync
1901	* writes. No point in waiting for joiners in that case.
1902	*
1903	* Setting max_batch_time to 0 disables this completely.
1904	*/
1905	pid = current->pid;
1906	if (handle->h_sync && journal->j_last_sync_writer != pid &&
1907	journal->j_max_batch_time) {
1908	u64 commit_time, trans_time;
1909
1910	journal->j_last_sync_writer = pid;
1911
1912	read_lock(&journal->j_state_lock);
1913	commit_time = journal->j_average_commit_time;
1914	read_unlock(&journal->j_state_lock);
1915
1916	trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1917	transaction->t_start_time));
1918
1919	commit_time = max_t(u64, commit_time,
1920	`1000`*journal->j_min_batch_time);
1921	commit_time = min_t(u64, commit_time,
1922	`1000`*journal->j_max_batch_time);
1923
1924	if (trans_time < commit_time) {
1925	ktime_t expires = ktime_add_ns(ktime_get(),
1926	commit_time);
1927	set_current_state(TASK_UNINTERRUPTIBLE);
1928	schedule_hrtimeout(expires: &expires, mode: HRTIMER_MODE_ABS);
1929	}
1930	}
1931
1932	if (handle->h_sync)
1933	transaction->t_synchronous_commit = `1`;
1934
1935	/*
1936	* If the handle is marked SYNC, we need to set another commit
1937	* going! We also want to force a commit if the transaction is too
1938	* old now.
1939	*/
1940	if (handle->h_sync \|\|
1941	time_after_eq(jiffies, transaction->t_expires)) {
1942	/ Do this even for aborted journals: an abort still*
1943	* completes the commit thread, it just doesn't write
1944	* anything to disk. */
1945
1946	jbd2_debug(`2`, "transaction too old, requesting commit for "
1947	"handle %p\n", handle);
1948	/ This is non-blocking /
1949	jbd2_log_start_commit(journal, tid);
1950
1951	/*
1952	* Special case: JBD2_SYNC synchronous updates require us
1953	* to wait for the commit to complete.
1954	*/
1955	if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1956	wait_for_commit = `1`;
1957	}
1958
1959	/*
1960	* Once stop_this_handle() drops t_updates, the transaction could start
1961	* committing on us and eventually disappear. So we must not
1962	* dereference transaction pointer again after calling
1963	* stop_this_handle().
1964	*/
1965	stop_this_handle(handle);
1966
1967	if (wait_for_commit)
1968	err = jbd2_log_wait_commit(journal, tid);
1969
1970	free_and_exit:
1971	if (handle->h_rsv_handle)
1972	jbd2_free_handle(handle: handle->h_rsv_handle);
1973	jbd2_free_handle(handle);
1974	return err;
1975	}
1976
1977	/*
1978	*
1979	* List management code snippets: various functions for manipulating the
1980	* transaction buffer lists.
1981	*
1982	*/
1983
1984	/*
1985	* Append a buffer to a transaction list, given the transaction's list head
1986	* pointer.
1987	*
1988	* j_list_lock is held.
1989	*
1990	* jh->b_state_lock is held.
1991	*/
1992
1993	static inline void
1994	__blist_add_buffer(struct journal_head list, struct** journal_head *jh)
1995	{
1996	if (!*list) {
1997	jh->b_tnext = jh->b_tprev = jh;
1998	*list = jh;
1999	} else {
2000	/ Insert at the tail of the list to preserve order /
2001	struct journal_head first = list, *last = first->b_tprev;
2002	jh->b_tprev = last;
2003	jh->b_tnext = first;
2004	last->b_tnext = first->b_tprev = jh;
2005	}
2006	}
2007
2008	/*
2009	* Remove a buffer from a transaction list, given the transaction's list
2010	* head pointer.
2011	*
2012	* Called with j_list_lock held, and the journal may not be locked.
2013	*
2014	* jh->b_state_lock is held.
2015	*/
2016
2017	static inline void
2018	__blist_del_buffer(struct journal_head list, struct** journal_head *jh)
2019	{
2020	if (*list == jh) {
2021	*list = jh->b_tnext;
2022	if (*list == jh)
2023	*list = NULL;
2024	}
2025	jh->b_tprev->b_tnext = jh->b_tnext;
2026	jh->b_tnext->b_tprev = jh->b_tprev;
2027	}
2028
2029	/*
2030	* Remove a buffer from the appropriate transaction list.
2031	*
2032	* Note that this function can change the value of
2033	* bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
2034	* t_reserved_list. If the caller is holding onto a copy of one of these
2035	* pointers, it could go bad. Generally the caller needs to re-read the
2036	* pointer from the transaction_t.
2037	*
2038	* Called under j_list_lock.
2039	*/
2040	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
2041	{
2042	struct journal_head **list = NULL;
2043	transaction_t *transaction;
2044	struct buffer_head *bh = jh2bh(jh);
2045
2046	lockdep_assert_held(&jh->b_state_lock);
2047	transaction = jh->b_transaction;
2048	if (transaction)
2049	assert_spin_locked(&transaction->t_journal->j_list_lock);
2050
2051	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2052	if (jh->b_jlist != BJ_None)
2053	J_ASSERT_JH(jh, transaction != NULL);
2054
2055	switch (jh->b_jlist) {
2056	case BJ_None:
2057	return;
2058	case BJ_Metadata:
2059	transaction->t_nr_buffers--;
2060	J_ASSERT_JH(jh, transaction->t_nr_buffers >= `0`);
2061	list = &transaction->t_buffers;
2062	break;
2063	case BJ_Forget:
2064	list = &transaction->t_forget;
2065	break;
2066	case BJ_Shadow:
2067	list = &transaction->t_shadow_list;
2068	break;
2069	case BJ_Reserved:
2070	list = &transaction->t_reserved_list;
2071	break;
2072	}
2073
2074	__blist_del_buffer(list, jh);
2075	jh->b_jlist = BJ_None;
2076	if (transaction && is_journal_aborted(journal: transaction->t_journal))
2077	clear_buffer_jbddirty(bh);
2078	else if (test_clear_buffer_jbddirty(bh))
2079	mark_buffer_dirty(bh); / Expose it to the VM /
2080	}
2081
2082	/*
2083	* Remove buffer from all transactions. The caller is responsible for dropping
2084	* the jh reference that belonged to the transaction.
2085	*
2086	* Called with bh_state lock and j_list_lock
2087	*/
2088	static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
2089	{
2090	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2091	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
2092
2093	__jbd2_journal_temp_unlink_buffer(jh);
2094	jh->b_transaction = NULL;
2095	}
2096
2097	void jbd2_journal_unfile_buffer(journal_t journal, struct* journal_head *jh)
2098	{
2099	struct buffer_head *bh = jh2bh(jh);
2100
2101	/ Get reference so that buffer cannot be freed before we unlock it /
2102	get_bh(bh);
2103	spin_lock(lock: &jh->b_state_lock);
2104	spin_lock(lock: &journal->j_list_lock);
2105	__jbd2_journal_unfile_buffer(jh);
2106	spin_unlock(lock: &journal->j_list_lock);
2107	spin_unlock(lock: &jh->b_state_lock);
2108	jbd2_journal_put_journal_head(jh);
2109	__brelse(bh);
2110	}
2111
2112	/**
2113	* jbd2_journal_try_to_free_buffers() - try to free page buffers.
2114	* @journal: journal for operation
2115	* @folio: Folio to detach data from.
2116	*
2117	* For all the buffers on this page,
2118	* if they are fully written out ordered data, move them onto BUF_CLEAN
2119	* so try_to_free_buffers() can reap them.
2120	*
2121	* This function returns non-zero if we wish try_to_free_buffers()
2122	* to be called. We do this if the page is releasable by try_to_free_buffers().
2123	* We also do it if the page has locked or dirty buffers and the caller wants
2124	* us to perform sync or async writeout.
2125	*
2126	* This complicates JBD locking somewhat. We aren't protected by the
2127	* BKL here. We wish to remove the buffer from its committing or
2128	* running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
2129	*
2130	* This may change the value of transaction_t->t_datalist, so anyone
2131	* who looks at t_datalist needs to lock against this function.
2132	*
2133	* Even worse, someone may be doing a jbd2_journal_dirty_data on this
2134	* buffer. So we need to lock against that. jbd2_journal_dirty_data()
2135	* will come out of the lock with the buffer dirty, which makes it
2136	* ineligible for release here.
2137	*
2138	* Who else is affected by this? hmm... Really the only contender
2139	* is do_get_write_access() - it could be looking at the buffer while
2140	* journal_try_to_free_buffer() is changing its state. But that
2141	* cannot happen because we never reallocate freed data as metadata
2142	* while the data is part of a transaction. Yes?
2143	*
2144	* Return false on failure, true on success
2145	*/
2146	bool jbd2_journal_try_to_free_buffers(journal_t journal, struct* folio *folio)
2147	{
2148	struct buffer_head *head;
2149	struct buffer_head *bh;
2150	bool ret = false;
2151
2152	J_ASSERT(folio_test_locked(folio));
2153
2154	head = folio_buffers(folio);
2155	bh = head;
2156	do {
2157	struct journal_head *jh;
2158
2159	/*
2160	* We take our own ref against the journal_head here to avoid
2161	* having to add tons of locking around each instance of
2162	* jbd2_journal_put_journal_head().
2163	*/
2164	jh = jbd2_journal_grab_journal_head(bh);
2165	if (!jh)
2166	continue;
2167
2168	spin_lock(lock: &jh->b_state_lock);
2169	if (!jh->b_transaction && !jh->b_next_transaction) {
2170	spin_lock(lock: &journal->j_list_lock);
2171	/ Remove written-back checkpointed metadata buffer /
2172	if (jh->b_cp_transaction != NULL)
2173	jbd2_journal_try_remove_checkpoint(jh);
2174	spin_unlock(lock: &journal->j_list_lock);
2175	}
2176	spin_unlock(lock: &jh->b_state_lock);
2177	jbd2_journal_put_journal_head(jh);
2178	if (buffer_jbd(bh))
2179	goto busy;
2180	} while ((bh = bh->b_this_page) != head);
2181
2182	ret = try_to_free_buffers(folio);
2183	busy:
2184	return ret;
2185	}
2186
2187	/*
2188	* This buffer is no longer needed. If it is on an older transaction's
2189	* checkpoint list we need to record it on this transaction's forget list
2190	* to pin this buffer (and hence its checkpointing transaction) down until
2191	* this transaction commits. If the buffer isn't on a checkpoint list, we
2192	* release it.
2193	* Returns non-zero if JBD no longer has an interest in the buffer.
2194	*
2195	* Called under j_list_lock.
2196	*
2197	* Called under jh->b_state_lock.
2198	*/
2199	static int __dispose_buffer(struct journal_head jh, transaction_t transaction)
2200	{
2201	int may_free = `1`;
2202	struct buffer_head *bh = jh2bh(jh);
2203
2204	if (jh->b_cp_transaction) {
2205	JBUFFER_TRACE(jh, "on running+cp transaction");
2206	__jbd2_journal_temp_unlink_buffer(jh);
2207	/*
2208	* We don't want to write the buffer anymore, clear the
2209	* bit so that we don't confuse checks in
2210	* __journal_file_buffer
2211	*/
2212	clear_buffer_dirty(bh);
2213	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
2214	may_free = `0`;
2215	} else {
2216	JBUFFER_TRACE(jh, "on running transaction");
2217	__jbd2_journal_unfile_buffer(jh);
2218	jbd2_journal_put_journal_head(jh);
2219	}
2220	return may_free;
2221	}
2222
2223	/*
2224	* jbd2_journal_invalidate_folio
2225	*
2226	* This code is tricky. It has a number of cases to deal with.
2227	*
2228	* There are two invariants which this code relies on:
2229	*
2230	* i_size must be updated on disk before we start calling invalidate_folio
2231	* on the data.
2232	*
2233	* This is done in ext3 by defining an ext3_setattr method which
2234	* updates i_size before truncate gets going. By maintaining this
2235	* invariant, we can be sure that it is safe to throw away any buffers
2236	* attached to the current transaction: once the transaction commits,
2237	* we know that the data will not be needed.
2238	*
2239	* Note however that we can not throw away data belonging to the
2240	* previous, committing transaction!
2241	*
2242	* Any disk blocks which are part of the previous, committing
2243	* transaction (and which therefore cannot be discarded immediately) are
2244	* not going to be reused in the new running transaction
2245	*
2246	* The bitmap committed_data images guarantee this: any block which is
2247	* allocated in one transaction and removed in the next will be marked
2248	* as in-use in the committed_data bitmap, so cannot be reused until
2249	* the next transaction to delete the block commits. This means that
2250	* leaving committing buffers dirty is quite safe: the disk blocks
2251	* cannot be reallocated to a different file and so buffer aliasing is
2252	* not possible.
2253	*
2254	*
2255	* The above applies mainly to ordered data mode. In writeback mode we
2256	* don't make guarantees about the order in which data hits disk --- in
2257	* particular we don't guarantee that new dirty data is flushed before
2258	* transaction commit --- so it is always safe just to discard data
2259	* immediately in that mode. --sct
2260	*/
2261
2262	/*
2263	* The journal_unmap_buffer helper function returns zero if the buffer
2264	* concerned remains pinned as an anonymous buffer belonging to an older
2265	* transaction.
2266	*
2267	* We're outside-transaction here. Either or both of j_running_transaction
2268	* and j_committing_transaction may be NULL.
2269	*/
2270	static int journal_unmap_buffer(journal_t journal, struct* buffer_head *bh,
2271	int partial_page)
2272	{
2273	transaction_t *transaction;
2274	struct journal_head *jh;
2275	int may_free = `1`;
2276
2277	BUFFER_TRACE(bh, "entry");
2278
2279	/*
2280	* It is safe to proceed here without the j_list_lock because the
2281	* buffers cannot be stolen by try_to_free_buffers as long as we are
2282	* holding the page lock. --sct
2283	*/
2284
2285	jh = jbd2_journal_grab_journal_head(bh);
2286	if (!jh)
2287	goto zap_buffer_unlocked;
2288
2289	/ OK, we have data buffer in journaled mode /
2290	write_lock(&journal->j_state_lock);
2291	spin_lock(lock: &jh->b_state_lock);
2292	spin_lock(lock: &journal->j_list_lock);
2293
2294	/*
2295	* We cannot remove the buffer from checkpoint lists until the
2296	* transaction adding inode to orphan list (let's call it T)
2297	* is committed. Otherwise if the transaction changing the
2298	* buffer would be cleaned from the journal before T is
2299	* committed, a crash will cause that the correct contents of
2300	* the buffer will be lost. On the other hand we have to
2301	* clear the buffer dirty bit at latest at the moment when the
2302	* transaction marking the buffer as freed in the filesystem
2303	* structures is committed because from that moment on the
2304	* block can be reallocated and used by a different page.
2305	* Since the block hasn't been freed yet but the inode has
2306	* already been added to orphan list, it is safe for us to add
2307	* the buffer to BJ_Forget list of the newest transaction.
2308	*
2309	* Also we have to clear buffer_mapped flag of a truncated buffer
2310	* because the buffer_head may be attached to the page straddling
2311	* i_size (can happen only when blocksize < pagesize) and thus the
2312	* buffer_head can be reused when the file is extended again. So we end
2313	* up keeping around invalidated buffers attached to transactions'
2314	* BJ_Forget list just to stop checkpointing code from cleaning up
2315	* the transaction this buffer was modified in.
2316	*/
2317	transaction = jh->b_transaction;
2318	if (transaction == NULL) {
2319	/ First case: not on any transaction. If it*
2320	* has no checkpoint link, then we can zap it:
2321	* it's a writeback-mode buffer so we don't care
2322	* if it hits disk safely. */
2323	if (!jh->b_cp_transaction) {
2324	JBUFFER_TRACE(jh, "not on any transaction: zap");
2325	goto zap_buffer;
2326	}
2327
2328	if (!buffer_dirty(bh)) {
2329	/ bdflush has written it. We can drop it now /
2330	__jbd2_journal_remove_checkpoint(jh);
2331	goto zap_buffer;
2332	}
2333
2334	/ OK, it must be in the journal but still not*
2335	* written fully to disk: it's metadata or
2336	* journaled data... */
2337
2338	if (journal->j_running_transaction) {
2339	/ ... and once the current transaction has*
2340	* committed, the buffer won't be needed any
2341	* longer. */
2342	JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
2343	may_free = __dispose_buffer(jh,
2344	transaction: journal->j_running_transaction);
2345	goto zap_buffer;
2346	} else {
2347	/ There is no currently-running transaction. So the*
2348	* orphan record which we wrote for this file must have
2349	* passed into commit. We must attach this buffer to
2350	* the committing transaction, if it exists. */
2351	if (journal->j_committing_transaction) {
2352	JBUFFER_TRACE(jh, "give to committing trans");
2353	may_free = __dispose_buffer(jh,
2354	transaction: journal->j_committing_transaction);
2355	goto zap_buffer;
2356	} else {
2357	/ The orphan record's transaction has*
2358	* committed. We can cleanse this buffer */
2359	clear_buffer_jbddirty(bh);
2360	__jbd2_journal_remove_checkpoint(jh);
2361	goto zap_buffer;
2362	}
2363	}
2364	} else if (transaction == journal->j_committing_transaction) {
2365	JBUFFER_TRACE(jh, "on committing transaction");
2366	/*
2367	* The buffer is committing, we simply cannot touch
2368	* it. If the page is straddling i_size we have to wait
2369	* for commit and try again.
2370	*/
2371	if (partial_page) {
2372	spin_unlock(lock: &journal->j_list_lock);
2373	spin_unlock(lock: &jh->b_state_lock);
2374	write_unlock(&journal->j_state_lock);
2375	jbd2_journal_put_journal_head(jh);
2376	/ Already zapped buffer? Nothing to do... /
2377	if (!bh->b_bdev)
2378	return `0`;
2379	return -EBUSY;
2380	}
2381	/*
2382	* OK, buffer won't be reachable after truncate. We just clear
2383	* b_modified to not confuse transaction credit accounting, and
2384	* set j_next_transaction to the running transaction (if there
2385	* is one) and mark buffer as freed so that commit code knows
2386	* it should clear dirty bits when it is done with the buffer.
2387	*/
2388	set_buffer_freed(bh);
2389	if (journal->j_running_transaction && buffer_jbddirty(bh))
2390	jh->b_next_transaction = journal->j_running_transaction;
2391	jh->b_modified = `0`;
2392	spin_unlock(lock: &journal->j_list_lock);
2393	spin_unlock(lock: &jh->b_state_lock);
2394	write_unlock(&journal->j_state_lock);
2395	jbd2_journal_put_journal_head(jh);
2396	return `0`;
2397	} else {
2398	/ Good, the buffer belongs to the running transaction.*
2399	* We are writing our own transaction's data, not any
2400	* previous one's, so it is safe to throw it away
2401	* (remember that we expect the filesystem to have set
2402	* i_size already for this truncate so recovery will not
2403	* expose the disk blocks we are discarding here.) */
2404	J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
2405	JBUFFER_TRACE(jh, "on running transaction");
2406	may_free = __dispose_buffer(jh, transaction);
2407	}
2408
2409	zap_buffer:
2410	/*
2411	* This is tricky. Although the buffer is truncated, it may be reused
2412	* if blocksize < pagesize and it is attached to the page straddling
2413	* EOF. Since the buffer might have been added to BJ_Forget list of the
2414	* running transaction, journal_get_write_access() won't clear
2415	* b_modified and credit accounting gets confused. So clear b_modified
2416	* here.
2417	*/
2418	jh->b_modified = `0`;
2419	spin_unlock(lock: &journal->j_list_lock);
2420	spin_unlock(lock: &jh->b_state_lock);
2421	write_unlock(&journal->j_state_lock);
2422	jbd2_journal_put_journal_head(jh);
2423	zap_buffer_unlocked:
2424	clear_buffer_dirty(bh);
2425	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2426	clear_buffer_mapped(bh);
2427	clear_buffer_req(bh);
2428	clear_buffer_new(bh);
2429	clear_buffer_delay(bh);
2430	clear_buffer_unwritten(bh);
2431	bh->b_bdev = NULL;
2432	return may_free;
2433	}
2434
2435	/**
2436	* jbd2_journal_invalidate_folio()
2437	* @journal: journal to use for flush...
2438	* @folio: folio to flush
2439	* @offset: start of the range to invalidate
2440	* @length: length of the range to invalidate
2441	*
2442	* Reap page buffers containing data after in the specified range in page.
2443	* Can return -EBUSY if buffers are part of the committing transaction and
2444	* the page is straddling i_size. Caller then has to wait for current commit
2445	* and try again.
2446	*/
2447	int jbd2_journal_invalidate_folio(journal_t journal, struct* folio *folio,
2448	size_t offset, size_t length)
2449	{
2450	struct buffer_head head, bh, *next;
2451	unsigned int stop = offset + length;
2452	unsigned int curr_off = `0`;
2453	int partial_page = (offset \|\| length < folio_size(folio));
2454	int may_free = `1`;
2455	int ret = `0`;
2456
2457	if (!folio_test_locked(folio))
2458	BUG();
2459	head = folio_buffers(folio);
2460	if (!head)
2461	return `0`;
2462
2463	BUG_ON(stop > folio_size(folio) \|\| stop < length);
2464
2465	/ We will potentially be playing with lists other than just the*
2466	* data lists (especially for journaled data mode), so be
2467	* cautious in our locking. */
2468
2469	bh = head;
2470	do {
2471	unsigned int next_off = curr_off + bh->b_size;
2472	next = bh->b_this_page;
2473
2474	if (next_off > stop)
2475	return `0`;
2476
2477	if (offset <= curr_off) {
2478	/ This block is wholly outside the truncation point /
2479	lock_buffer(bh);
2480	ret = journal_unmap_buffer(journal, bh, partial_page);
2481	unlock_buffer(bh);
2482	if (ret < `0`)
2483	return ret;
2484	may_free &= ret;
2485	}
2486	curr_off = next_off;
2487	bh = next;
2488
2489	} while (bh != head);
2490
2491	if (!partial_page) {
2492	if (may_free && try_to_free_buffers(folio))
2493	J_ASSERT(!folio_buffers(folio));
2494	}
2495	return `0`;
2496	}
2497
2498	/*
2499	* File a buffer on the given transaction list.
2500	*/
2501	void __jbd2_journal_file_buffer(struct journal_head *jh,
2502	transaction_t transaction, int* jlist)
2503	{
2504	struct journal_head **list = NULL;
2505	int was_dirty = `0`;
2506	struct buffer_head *bh = jh2bh(jh);
2507
2508	lockdep_assert_held(&jh->b_state_lock);
2509	assert_spin_locked(&transaction->t_journal->j_list_lock);
2510
2511	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2512	J_ASSERT_JH(jh, jh->b_transaction == transaction \|\|
2513	jh->b_transaction == NULL);
2514
2515	if (jh->b_transaction && jh->b_jlist == jlist)
2516	return;
2517
2518	if (jlist == BJ_Metadata \|\| jlist == BJ_Reserved \|\|
2519	jlist == BJ_Shadow \|\| jlist == BJ_Forget) {
2520	/*
2521	* For metadata buffers, we track dirty bit in buffer_jbddirty
2522	* instead of buffer_dirty. We should not see a dirty bit set
2523	* here because we clear it in do_get_write_access but e.g.
2524	* tune2fs can modify the sb and set the dirty bit at any time
2525	* so we try to gracefully handle that.
2526	*/
2527	if (buffer_dirty(bh))
2528	warn_dirty_buffer(bh);
2529	if (test_clear_buffer_dirty(bh) \|\|
2530	test_clear_buffer_jbddirty(bh))
2531	was_dirty = `1`;
2532	}
2533
2534	if (jh->b_transaction)
2535	__jbd2_journal_temp_unlink_buffer(jh);
2536	else
2537	jbd2_journal_grab_journal_head(bh);
2538	jh->b_transaction = transaction;
2539
2540	switch (jlist) {
2541	case BJ_None:
2542	J_ASSERT_JH(jh, !jh->b_committed_data);
2543	J_ASSERT_JH(jh, !jh->b_frozen_data);
2544	return;
2545	case BJ_Metadata:
2546	transaction->t_nr_buffers++;
2547	list = &transaction->t_buffers;
2548	break;
2549	case BJ_Forget:
2550	list = &transaction->t_forget;
2551	break;
2552	case BJ_Shadow:
2553	list = &transaction->t_shadow_list;
2554	break;
2555	case BJ_Reserved:
2556	list = &transaction->t_reserved_list;
2557	break;
2558	}
2559
2560	__blist_add_buffer(list, jh);
2561	jh->b_jlist = jlist;
2562
2563	if (was_dirty)
2564	set_buffer_jbddirty(bh);
2565	}
2566
2567	void jbd2_journal_file_buffer(struct journal_head *jh,
2568	transaction_t transaction, int* jlist)
2569	{
2570	spin_lock(lock: &jh->b_state_lock);
2571	spin_lock(lock: &transaction->t_journal->j_list_lock);
2572	__jbd2_journal_file_buffer(jh, transaction, jlist);
2573	spin_unlock(lock: &transaction->t_journal->j_list_lock);
2574	spin_unlock(lock: &jh->b_state_lock);
2575	}
2576
2577	/*
2578	* Remove a buffer from its current buffer list in preparation for
2579	* dropping it from its current transaction entirely. If the buffer has
2580	* already started to be used by a subsequent transaction, refile the
2581	* buffer on that transaction's metadata list.
2582	*
2583	* Called under j_list_lock
2584	* Called under jh->b_state_lock
2585	*
2586	* When this function returns true, there's no next transaction to refile to
2587	* and the caller has to drop jh reference through
2588	* jbd2_journal_put_journal_head().
2589	*/
2590	bool __jbd2_journal_refile_buffer(struct journal_head *jh)
2591	{
2592	int was_dirty, jlist;
2593	struct buffer_head *bh = jh2bh(jh);
2594
2595	lockdep_assert_held(&jh->b_state_lock);
2596	if (jh->b_transaction)
2597	assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2598
2599	/ If the buffer is now unused, just drop it. /
2600	if (jh->b_next_transaction == NULL) {
2601	__jbd2_journal_unfile_buffer(jh);
2602	return true;
2603	}
2604
2605	/*
2606	* It has been modified by a later transaction: add it to the new
2607	* transaction's metadata list.
2608	*/
2609
2610	was_dirty = test_clear_buffer_jbddirty(bh);
2611	__jbd2_journal_temp_unlink_buffer(jh);
2612
2613	/*
2614	* b_transaction must be set, otherwise the new b_transaction won't
2615	* be holding jh reference
2616	*/
2617	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2618
2619	/*
2620	* We set b_transaction here because b_next_transaction will inherit
2621	* our jh reference and thus __jbd2_journal_file_buffer() must not
2622	* take a new one.
2623	*/
2624	WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
2625	WRITE_ONCE(jh->b_next_transaction, NULL);
2626	if (buffer_freed(bh))
2627	jlist = BJ_Forget;
2628	else if (jh->b_modified)
2629	jlist = BJ_Metadata;
2630	else
2631	jlist = BJ_Reserved;
2632	__jbd2_journal_file_buffer(jh, transaction: jh->b_transaction, jlist);
2633	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2634
2635	if (was_dirty)
2636	set_buffer_jbddirty(bh);
2637	return false;
2638	}
2639
2640	/*
2641	* __jbd2_journal_refile_buffer() with necessary locking added. We take our
2642	* bh reference so that we can safely unlock bh.
2643	*
2644	* The jh and bh may be freed by this call.
2645	*/
2646	void jbd2_journal_refile_buffer(journal_t journal, struct* journal_head *jh)
2647	{
2648	bool drop;
2649
2650	spin_lock(lock: &jh->b_state_lock);
2651	spin_lock(lock: &journal->j_list_lock);
2652	drop = __jbd2_journal_refile_buffer(jh);
2653	spin_unlock(lock: &jh->b_state_lock);
2654	spin_unlock(lock: &journal->j_list_lock);
2655	if (drop)
2656	jbd2_journal_put_journal_head(jh);
2657	}
2658
2659	/*
2660	* File inode in the inode list of the handle's transaction
2661	*/
2662	static int jbd2_journal_file_inode(handle_t handle, struct* jbd2_inode *jinode,
2663	unsigned long flags, loff_t start_byte, loff_t end_byte)
2664	{
2665	transaction_t *transaction = handle->h_transaction;
2666	journal_t *journal;
2667
2668	if (is_handle_aborted(handle))
2669	return -EROFS;
2670	journal = transaction->t_journal;
2671
2672	jbd2_debug(`4`, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2673	transaction->t_tid);
2674
2675	spin_lock(lock: &journal->j_list_lock);
2676	jinode->i_flags \|= flags;
2677
2678	if (jinode->i_dirty_end) {
2679	jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
2680	jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
2681	} else {
2682	jinode->i_dirty_start = start_byte;
2683	jinode->i_dirty_end = end_byte;
2684	}
2685
2686	/ Is inode already attached where we need it? /
2687	if (jinode->i_transaction == transaction \|\|
2688	jinode->i_next_transaction == transaction)
2689	goto done;
2690
2691	/*
2692	* We only ever set this variable to 1 so the test is safe. Since
2693	* t_need_data_flush is likely to be set, we do the test to save some
2694	* cacheline bouncing
2695	*/
2696	if (!transaction->t_need_data_flush)
2697	transaction->t_need_data_flush = `1`;
2698	/ On some different transaction's list - should be*
2699	* the committing one */
2700	if (jinode->i_transaction) {
2701	J_ASSERT(jinode->i_next_transaction == NULL);
2702	J_ASSERT(jinode->i_transaction ==
2703	journal->j_committing_transaction);
2704	jinode->i_next_transaction = transaction;
2705	goto done;
2706	}
2707	/ Not on any transaction list... /
2708	J_ASSERT(!jinode->i_next_transaction);
2709	jinode->i_transaction = transaction;
2710	list_add(new: &jinode->i_list, head: &transaction->t_inode_list);
2711	done:
2712	spin_unlock(lock: &journal->j_list_lock);
2713
2714	return `0`;
2715	}
2716
2717	int jbd2_journal_inode_ranged_write(handle_t *handle,
2718	struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
2719	{
2720	return jbd2_journal_file_inode(handle, jinode,
2721	JI_WRITE_DATA \| JI_WAIT_DATA, start_byte,
2722	end_byte: start_byte + length - `1`);
2723	}
2724
2725	int jbd2_journal_inode_ranged_wait(handle_t handle, struct* jbd2_inode *jinode,
2726	loff_t start_byte, loff_t length)
2727	{
2728	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
2729	start_byte, end_byte: start_byte + length - `1`);
2730	}
2731
2732	/*
2733	* File truncate and transaction commit interact with each other in a
2734	* non-trivial way. If a transaction writing data block A is
2735	* committing, we cannot discard the data by truncate until we have
2736	* written them. Otherwise if we crashed after the transaction with
2737	* write has committed but before the transaction with truncate has
2738	* committed, we could see stale data in block A. This function is a
2739	* helper to solve this problem. It starts writeout of the truncated
2740	* part in case it is in the committing transaction.
2741	*
2742	* Filesystem code must call this function when inode is journaled in
2743	* ordered mode before truncation happens and after the inode has been
2744	* placed on orphan list with the new inode size. The second condition
2745	* avoids the race that someone writes new data and we start
2746	* committing the transaction after this function has been called but
2747	* before a transaction for truncate is started (and furthermore it
2748	* allows us to optimize the case where the addition to orphan list
2749	* happens in the same transaction as write --- we don't have to write
2750	* any data in such case).
2751	*/
2752	int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2753	struct jbd2_inode *jinode,
2754	loff_t new_size)
2755	{
2756	transaction_t inode_trans, commit_trans;
2757	int ret = `0`;
2758
2759	/ This is a quick check to avoid locking if not necessary /
2760	if (!jinode->i_transaction)
2761	goto out;
2762	/ Locks are here just to force reading of recent values, it is*
2763	* enough that the transaction was not committing before we started
2764	* a transaction adding the inode to orphan list */
2765	read_lock(&journal->j_state_lock);
2766	commit_trans = journal->j_committing_transaction;
2767	read_unlock(&journal->j_state_lock);
2768	spin_lock(lock: &journal->j_list_lock);
2769	inode_trans = jinode->i_transaction;
2770	spin_unlock(lock: &journal->j_list_lock);
2771	if (inode_trans == commit_trans) {
2772	ret = filemap_fdatawrite_range(mapping: jinode->i_vfs_inode->i_mapping,
2773	start: new_size, LLONG_MAX);
2774	if (ret)
2775	jbd2_journal_abort(journal, ret);
2776	}
2777	out:
2778	return ret;
2779	}
2780

source code of linux/fs/jbd2/transaction.c