xfs_log.c source code [linux/fs/xfs/xfs_log.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_mount.h"
13	#include "xfs_errortag.h"
14	#include "xfs_error.h"
15	#include "xfs_trans.h"
16	#include "xfs_trans_priv.h"
17	#include "xfs_log.h"
18	#include "xfs_log_priv.h"
19	#include "xfs_trace.h"
20	#include "xfs_sysfs.h"
21	#include "xfs_sb.h"
22	#include "xfs_health.h"
23	#include "xfs_zone_alloc.h"
24
25	struct kmem_cache *xfs_log_ticket_cache;
26
27	/ Local miscellaneous function prototypes /
28	STATIC struct xlog *
29	xlog_alloc_log(
30	struct xfs_mount *mp,
31	struct xfs_buftarg *log_target,
32	xfs_daddr_t blk_offset,
33	int num_bblks);
34	STATIC void
35	xlog_dealloc_log(
36	struct xlog *log);
37
38	/ local state machine functions /
39	STATIC void xlog_state_done_syncing(
40	struct xlog_in_core *iclog);
41	STATIC void xlog_state_do_callback(
42	struct xlog *log);
43	STATIC int
44	xlog_state_get_iclog_space(
45	struct xlog *log,
46	int len,
47	struct xlog_in_core **iclog,
48	struct xlog_ticket *ticket,
49	int *logoffsetp);
50	STATIC void
51	xlog_sync(
52	struct xlog *log,
53	struct xlog_in_core *iclog,
54	struct xlog_ticket *ticket);
55	#if defined(DEBUG)
56	STATIC void
57	xlog_verify_iclog(
58	struct xlog *log,
59	struct xlog_in_core *iclog,
60	int count);
61	STATIC void
62	xlog_verify_tail_lsn(
63	struct xlog *log,
64	struct xlog_in_core *iclog);
65	#else
66	#define xlog_verify_iclog(a,b,c)
67	#define xlog_verify_tail_lsn(a,b)
68	#endif
69
70	STATIC int
71	xlog_iclogs_empty(
72	struct xlog *log);
73
74	static int
75	xfs_log_cover(struct xfs_mount *);
76
77	/*
78	* We need to make sure the buffer pointer returned is naturally aligned for the
79	* biggest basic data type we put into it. We have already accounted for this
80	* padding when sizing the buffer.
81	*
82	* However, this padding does not get written into the log, and hence we have to
83	* track the space used by the log vectors separately to prevent log space hangs
84	* due to inaccurate accounting (i.e. a leak) of the used log space through the
85	* CIL context ticket.
86	*
87	* We also add space for the xlog_op_header that describes this region in the
88	* log. This prepends the data region we return to the caller to copy their data
89	* into, so do all the static initialisation of the ophdr now. Because the ophdr
90	* is not 8 byte aligned, we have to be careful to ensure that we align the
91	* start of the buffer such that the region we return to the call is 8 byte
92	* aligned and packed against the tail of the ophdr.
93	*/
94	void *
95	xlog_prepare_iovec(
96	struct xfs_log_vec *lv,
97	struct xfs_log_iovec **vecp,
98	uint type)
99	{
100	struct xfs_log_iovec vec = vecp;
101	struct xlog_op_header *oph;
102	uint32_t len;
103	void *buf;
104
105	if (vec) {
106	ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
107	vec++;
108	} else {
109	vec = &lv->lv_iovecp[`0`];
110	}
111
112	len = lv->lv_buf_len + sizeof(struct xlog_op_header);
113	if (!IS_ALIGNED(len, sizeof(uint64_t))) {
114	lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
115	sizeof(struct xlog_op_header);
116	}
117
118	vec->i_type = type;
119	vec->i_addr = lv->lv_buf + lv->lv_buf_len;
120
121	oph = vec->i_addr;
122	oph->oh_clientid = XFS_TRANSACTION;
123	oph->oh_res2 = `0`;
124	oph->oh_flags = `0`;
125
126	buf = vec->i_addr + sizeof(struct xlog_op_header);
127	ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
128
129	*vecp = vec;
130	return buf;
131	}
132
133	static inline void
134	xlog_grant_sub_space(
135	struct xlog_grant_head *head,
136	int64_t bytes)
137	{
138	atomic64_sub(i: bytes, v: &head->grant);
139	}
140
141	static inline void
142	xlog_grant_add_space(
143	struct xlog_grant_head *head,
144	int64_t bytes)
145	{
146	atomic64_add(i: bytes, v: &head->grant);
147	}
148
149	static void
150	xlog_grant_head_init(
151	struct xlog_grant_head *head)
152	{
153	atomic64_set(v: &head->grant, i: `0`);
154	INIT_LIST_HEAD(list: &head->waiters);
155	spin_lock_init(&head->lock);
156	}
157
158	void
159	xlog_grant_return_space(
160	struct xlog *log,
161	xfs_lsn_t old_head,
162	xfs_lsn_t new_head)
163	{
164	int64_t diff = xlog_lsn_sub(log, new_head, old_head);
165
166	xlog_grant_sub_space(head: &log->l_reserve_head, bytes: diff);
167	xlog_grant_sub_space(head: &log->l_write_head, bytes: diff);
168	}
169
170	/*
171	* Return the space in the log between the tail and the head. In the case where
172	* we have overrun available reservation space, return 0. The memory barrier
173	* pairs with the smp_wmb() in xlog_cil_ail_insert() to ensure that grant head
174	* vs tail space updates are seen in the correct order and hence avoid
175	* transients as space is transferred from the grant heads to the AIL on commit
176	* completion.
177	*/
178	static uint64_t
179	xlog_grant_space_left(
180	struct xlog *log,
181	struct xlog_grant_head *head)
182	{
183	int64_t free_bytes;
184
185	smp_rmb(); / paired with smp_wmb in xlog_cil_ail_insert() /
186	free_bytes = log->l_logsize - READ_ONCE(log->l_tail_space) -
187	atomic64_read(v: &head->grant);
188	if (free_bytes > `0`)
189	return free_bytes;
190	return `0`;
191	}
192
193	STATIC void
194	xlog_grant_head_wake_all(
195	struct xlog_grant_head *head)
196	{
197	struct xlog_ticket *tic;
198
199	spin_lock(lock: &head->lock);
200	list_for_each_entry(tic, &head->waiters, t_queue)
201	wake_up_process(tsk: tic->t_task);
202	spin_unlock(lock: &head->lock);
203	}
204
205	static inline int
206	xlog_ticket_reservation(
207	struct xlog *log,
208	struct xlog_grant_head *head,
209	struct xlog_ticket *tic)
210	{
211	if (head == &log->l_write_head) {
212	ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
213	return tic->t_unit_res;
214	}
215
216	if (tic->t_flags & XLOG_TIC_PERM_RESERV)
217	return tic->t_unit_res * tic->t_cnt;
218
219	return tic->t_unit_res;
220	}
221
222	STATIC bool
223	xlog_grant_head_wake(
224	struct xlog *log,
225	struct xlog_grant_head *head,
226	int *free_bytes)
227	{
228	struct xlog_ticket *tic;
229	int need_bytes;
230
231	list_for_each_entry(tic, &head->waiters, t_queue) {
232	need_bytes = xlog_ticket_reservation(log, head, tic);
233	if (*free_bytes < need_bytes)
234	return false;
235
236	*free_bytes -= need_bytes;
237	trace_xfs_log_grant_wake_up(log, tic);
238	wake_up_process(tsk: tic->t_task);
239	}
240
241	return true;
242	}
243
244	STATIC int
245	xlog_grant_head_wait(
246	struct xlog *log,
247	struct xlog_grant_head *head,
248	struct xlog_ticket *tic,
249	int need_bytes) __releases(&head->lock)
250	__acquires(&head->lock)
251	{
252	list_add_tail(new: &tic->t_queue, head: &head->waiters);
253
254	do {
255	if (xlog_is_shutdown(log))
256	goto shutdown;
257
258	__set_current_state(TASK_UNINTERRUPTIBLE);
259	spin_unlock(lock: &head->lock);
260
261	XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
262
263	/ Push on the AIL to free up all the log space. /
264	xfs_ail_push_all(ailp: log->l_ailp);
265
266	trace_xfs_log_grant_sleep(log, tic);
267	schedule();
268	trace_xfs_log_grant_wake(log, tic);
269
270	spin_lock(lock: &head->lock);
271	if (xlog_is_shutdown(log))
272	goto shutdown;
273	} while (xlog_grant_space_left(log, head) < need_bytes);
274
275	list_del_init(entry: &tic->t_queue);
276	return `0`;
277	shutdown:
278	list_del_init(entry: &tic->t_queue);
279	return -EIO;
280	}
281
282	/*
283	* Atomically get the log space required for a log ticket.
284	*
285	* Once a ticket gets put onto head->waiters, it will only return after the
286	* needed reservation is satisfied.
287	*
288	* This function is structured so that it has a lock free fast path. This is
289	* necessary because every new transaction reservation will come through this
290	* path. Hence any lock will be globally hot if we take it unconditionally on
291	* every pass.
292	*
293	* As tickets are only ever moved on and off head->waiters under head->lock, we
294	* only need to take that lock if we are going to add the ticket to the queue
295	* and sleep. We can avoid taking the lock if the ticket was never added to
296	* head->waiters because the t_queue list head will be empty and we hold the
297	* only reference to it so it can safely be checked unlocked.
298	*/
299	STATIC int
300	xlog_grant_head_check(
301	struct xlog *log,
302	struct xlog_grant_head *head,
303	struct xlog_ticket *tic,
304	int *need_bytes)
305	{
306	int free_bytes;
307	int error = `0`;
308
309	ASSERT(!xlog_in_recovery(log));
310
311	/*
312	* If there are other waiters on the queue then give them a chance at
313	* logspace before us. Wake up the first waiters, if we do not wake
314	* up all the waiters then go to sleep waiting for more free space,
315	* otherwise try to get some space for this transaction.
316	*/
317	*need_bytes = xlog_ticket_reservation(log, head, tic);
318	free_bytes = xlog_grant_space_left(log, head);
319	if (!list_empty_careful(head: &head->waiters)) {
320	spin_lock(lock: &head->lock);
321	if (!xlog_grant_head_wake(log, head, free_bytes: &free_bytes) \|\|
322	free_bytes < *need_bytes) {
323	error = xlog_grant_head_wait(log, head, tic,
324	need_bytes: *need_bytes);
325	}
326	spin_unlock(lock: &head->lock);
327	} else if (free_bytes < *need_bytes) {
328	spin_lock(lock: &head->lock);
329	error = xlog_grant_head_wait(log, head, tic, need_bytes: *need_bytes);
330	spin_unlock(lock: &head->lock);
331	}
332
333	return error;
334	}
335
336	bool
337	xfs_log_writable(
338	struct xfs_mount *mp)
339	{
340	/*
341	* Do not write to the log on norecovery mounts, if the data or log
342	* devices are read-only, or if the filesystem is shutdown. Read-only
343	* mounts allow internal writes for log recovery and unmount purposes,
344	* so don't restrict that case.
345	*/
346	if (xfs_has_norecovery(mp))
347	return false;
348	if (xfs_readonly_buftarg(mp->m_ddev_targp))
349	return false;
350	if (xfs_readonly_buftarg(mp->m_log->l_targ))
351	return false;
352	if (xlog_is_shutdown(log: mp->m_log))
353	return false;
354	return true;
355	}
356
357	/*
358	* Replenish the byte reservation required by moving the grant write head.
359	*/
360	int
361	xfs_log_regrant(
362	struct xfs_mount *mp,
363	struct xlog_ticket *tic)
364	{
365	struct xlog *log = mp->m_log;
366	int need_bytes;
367	int error = `0`;
368
369	if (xlog_is_shutdown(log))
370	return -EIO;
371
372	XFS_STATS_INC(mp, xs_try_logspace);
373
374	/*
375	* This is a new transaction on the ticket, so we need to change the
376	* transaction ID so that the next transaction has a different TID in
377	* the log. Just add one to the existing tid so that we can see chains
378	* of rolling transactions in the log easily.
379	*/
380	tic->t_tid++;
381	tic->t_curr_res = tic->t_unit_res;
382	if (tic->t_cnt > `0`)
383	return `0`;
384
385	trace_xfs_log_regrant(log, tic);
386
387	error = xlog_grant_head_check(log, head: &log->l_write_head, tic,
388	need_bytes: &need_bytes);
389	if (error)
390	goto out_error;
391
392	xlog_grant_add_space(head: &log->l_write_head, bytes: need_bytes);
393	trace_xfs_log_regrant_exit(log, tic);
394	return `0`;
395
396	out_error:
397	/*
398	* If we are failing, make sure the ticket doesn't have any current
399	* reservations. We don't want to add this back when the ticket/
400	* transaction gets cancelled.
401	*/
402	tic->t_curr_res = `0`;
403	tic->t_cnt = `0`; / ungrant will give back unit_res * t_cnt. /
404	return error;
405	}
406
407	/*
408	* Reserve log space and return a ticket corresponding to the reservation.
409	*
410	* Each reservation is going to reserve extra space for a log record header.
411	* When writes happen to the on-disk log, we don't subtract the length of the
412	* log record header from any reservation. By wasting space in each
413	* reservation, we prevent over allocation problems.
414	*/
415	int
416	xfs_log_reserve(
417	struct xfs_mount *mp,
418	int unit_bytes,
419	int cnt,
420	struct xlog_ticket **ticp,
421	bool permanent)
422	{
423	struct xlog *log = mp->m_log;
424	struct xlog_ticket *tic;
425	int need_bytes;
426	int error = `0`;
427
428	if (xlog_is_shutdown(log))
429	return -EIO;
430
431	XFS_STATS_INC(mp, xs_try_logspace);
432
433	ASSERT(*ticp == NULL);
434	tic = xlog_ticket_alloc(log, unit_bytes, count: cnt, permanent);
435	*ticp = tic;
436	trace_xfs_log_reserve(log, tic);
437	error = xlog_grant_head_check(log, head: &log->l_reserve_head, tic,
438	need_bytes: &need_bytes);
439	if (error)
440	goto out_error;
441
442	xlog_grant_add_space(head: &log->l_reserve_head, bytes: need_bytes);
443	xlog_grant_add_space(head: &log->l_write_head, bytes: need_bytes);
444	trace_xfs_log_reserve_exit(log, tic);
445	return `0`;
446
447	out_error:
448	/*
449	* If we are failing, make sure the ticket doesn't have any current
450	* reservations. We don't want to add this back when the ticket/
451	* transaction gets cancelled.
452	*/
453	tic->t_curr_res = `0`;
454	tic->t_cnt = `0`; / ungrant will give back unit_res * t_cnt. /
455	return error;
456	}
457
458	/*
459	* Run all the pending iclog callbacks and wake log force waiters and iclog
460	* space waiters so they can process the newly set shutdown state. We really
461	* don't care what order we process callbacks here because the log is shut down
462	* and so state cannot change on disk anymore. However, we cannot wake waiters
463	* until the callbacks have been processed because we may be in unmount and
464	* we must ensure that all AIL operations the callbacks perform have completed
465	* before we tear down the AIL.
466	*
467	* We avoid processing actively referenced iclogs so that we don't run callbacks
468	* while the iclog owner might still be preparing the iclog for IO submssion.
469	* These will be caught by xlog_state_iclog_release() and call this function
470	* again to process any callbacks that may have been added to that iclog.
471	*/
472	static void
473	xlog_state_shutdown_callbacks(
474	struct xlog *log)
475	{
476	struct xlog_in_core *iclog;
477	LIST_HEAD(cb_list);
478
479	iclog = log->l_iclog;
480	do {
481	if (atomic_read(v: &iclog->ic_refcnt)) {
482	/ Reference holder will re-run iclog callbacks. /
483	continue;
484	}
485	list_splice_init(list: &iclog->ic_callbacks, head: &cb_list);
486	spin_unlock(lock: &log->l_icloglock);
487
488	xlog_cil_process_committed(list: &cb_list);
489
490	spin_lock(lock: &log->l_icloglock);
491	wake_up_all(&iclog->ic_write_wait);
492	wake_up_all(&iclog->ic_force_wait);
493	} while ((iclog = iclog->ic_next) != log->l_iclog);
494
495	wake_up_all(&log->l_flush_wait);
496	}
497
498	/*
499	* Flush iclog to disk if this is the last reference to the given iclog and the
500	* it is in the WANT_SYNC state.
501	*
502	* If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
503	* log tail is updated correctly. NEED_FUA indicates that the iclog will be
504	* written to stable storage, and implies that a commit record is contained
505	* within the iclog. We need to ensure that the log tail does not move beyond
506	* the tail that the first commit record in the iclog ordered against, otherwise
507	* correct recovery of that checkpoint becomes dependent on future operations
508	* performed on this iclog.
509	*
510	* Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
511	* current tail into iclog. Once the iclog tail is set, future operations must
512	* not modify it, otherwise they potentially violate ordering constraints for
513	* the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
514	* the iclog will get zeroed on activation of the iclog after sync, so we
515	* always capture the tail lsn on the iclog on the first NEED_FUA release
516	* regardless of the number of active reference counts on this iclog.
517	*/
518	int
519	xlog_state_release_iclog(
520	struct xlog *log,
521	struct xlog_in_core *iclog,
522	struct xlog_ticket *ticket)
523	{
524	bool last_ref;
525
526	lockdep_assert_held(&log->l_icloglock);
527
528	trace_xlog_iclog_release(iclog, _RET_IP_);
529	/*
530	* Grabbing the current log tail needs to be atomic w.r.t. the writing
531	* of the tail LSN into the iclog so we guarantee that the log tail does
532	* not move between the first time we know that the iclog needs to be
533	* made stable and when we eventually submit it.
534	*/
535	if ((iclog->ic_state == XLOG_STATE_WANT_SYNC \|\|
536	(iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
537	!iclog->ic_header.h_tail_lsn) {
538	iclog->ic_header.h_tail_lsn =
539	cpu_to_be64(atomic64_read(&log->l_tail_lsn));
540	}
541
542	last_ref = atomic_dec_and_test(v: &iclog->ic_refcnt);
543
544	if (xlog_is_shutdown(log)) {
545	/*
546	* If there are no more references to this iclog, process the
547	* pending iclog callbacks that were waiting on the release of
548	* this iclog.
549	*/
550	if (last_ref)
551	xlog_state_shutdown_callbacks(log);
552	return -EIO;
553	}
554
555	if (!last_ref)
556	return `0`;
557
558	if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
559	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
560	return `0`;
561	}
562
563	iclog->ic_state = XLOG_STATE_SYNCING;
564	xlog_verify_tail_lsn(log, iclog);
565	trace_xlog_iclog_syncing(iclog, _RET_IP_);
566
567	spin_unlock(lock: &log->l_icloglock);
568	xlog_sync(log, iclog, ticket);
569	spin_lock(lock: &log->l_icloglock);
570	return `0`;
571	}
572
573	/*
574	* Mount a log filesystem
575	*
576	* mp - ubiquitous xfs mount point structure
577	* log_target - buftarg of on-disk log device
578	* blk_offset - Start block # where block size is 512 bytes (BBSIZE)
579	* num_bblocks - Number of BBSIZE blocks in on-disk log
580	*
581	* Return error or zero.
582	*/
583	int
584	xfs_log_mount(
585	xfs_mount_t *mp,
586	struct xfs_buftarg *log_target,
587	xfs_daddr_t blk_offset,
588	int num_bblks)
589	{
590	struct xlog *log;
591	int error = `0`;
592	int min_logfsbs;
593
594	if (!xfs_has_norecovery(mp)) {
595	xfs_notice(mp, "Mounting V%d Filesystem %pU",
596	XFS_SB_VERSION_NUM(&mp->m_sb),
597	&mp->m_sb.sb_uuid);
598	} else {
599	xfs_notice(mp,
600	"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.",
601	XFS_SB_VERSION_NUM(&mp->m_sb),
602	&mp->m_sb.sb_uuid);
603	ASSERT(xfs_is_readonly(mp));
604	}
605
606	log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
607	if (IS_ERR(ptr: log)) {
608	error = PTR_ERR(ptr: log);
609	goto out;
610	}
611	mp->m_log = log;
612
613	/*
614	* Now that we have set up the log and it's internal geometry
615	* parameters, we can validate the given log space and drop a critical
616	* message via syslog if the log size is too small. A log that is too
617	* small can lead to unexpected situations in transaction log space
618	* reservation stage. The superblock verifier has already validated all
619	* the other log geometry constraints, so we don't have to check those
620	* here.
621	*
622	* Note: For v4 filesystems, we can't just reject the mount if the
623	* validation fails. This would mean that people would have to
624	* downgrade their kernel just to remedy the situation as there is no
625	* way to grow the log (short of black magic surgery with xfs_db).
626	*
627	* We can, however, reject mounts for V5 format filesystems, as the
628	* mkfs binary being used to make the filesystem should never create a
629	* filesystem with a log that is too small.
630	*/
631	min_logfsbs = xfs_log_calc_minimum_size(mp);
632	if (mp->m_sb.sb_logblocks < min_logfsbs) {
633	xfs_warn(mp,
634	"Log size %d blocks too small, minimum size is %d blocks",
635	mp->m_sb.sb_logblocks, min_logfsbs);
636
637	/*
638	* Log check errors are always fatal on v5; or whenever bad
639	* metadata leads to a crash.
640	*/
641	if (xfs_has_crc(mp)) {
642	xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
643	ASSERT(`0`);
644	error = -EINVAL;
645	goto out_free_log;
646	}
647	xfs_crit(mp, "Log size out of supported range.");
648	xfs_crit(mp,
649	"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
650	}
651
652	/*
653	* Initialize the AIL now we have a log.
654	*/
655	error = xfs_trans_ail_init(mp);
656	if (error) {
657	xfs_warn(mp, "AIL initialisation failed: error %d", error);
658	goto out_free_log;
659	}
660	log->l_ailp = mp->m_ail;
661
662	/*
663	* skip log recovery on a norecovery mount. pretend it all
664	* just worked.
665	*/
666	if (!xfs_has_norecovery(mp)) {
667	error = xlog_recover(log);
668	if (error) {
669	xfs_warn(mp, "log mount/recovery failed: error %d",
670	error);
671	xlog_recover_cancel(log);
672	goto out_destroy_ail;
673	}
674	}
675
676	error = xfs_sysfs_init(kobj: &log->l_kobj, ktype: &xfs_log_ktype, parent_kobj: &mp->m_kobj,
677	name: "log");
678	if (error)
679	goto out_destroy_ail;
680
681	/ Normal transactions can now occur /
682	clear_bit(XLOG_ACTIVE_RECOVERY, addr: &log->l_opstate);
683
684	/*
685	* Now the log has been fully initialised and we know were our
686	* space grant counters are, we can initialise the permanent ticket
687	* needed for delayed logging to work.
688	*/
689	xlog_cil_init_post_recovery(log);
690
691	return `0`;
692
693	out_destroy_ail:
694	xfs_trans_ail_destroy(mp);
695	out_free_log:
696	xlog_dealloc_log(log);
697	out:
698	return error;
699	}
700
701	/*
702	* Finish the recovery of the file system. This is separate from the
703	* xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
704	* in the root and real-time bitmap inodes between calling xfs_log_mount() and
705	* here.
706	*
707	* If we finish recovery successfully, start the background log work. If we are
708	* not doing recovery, then we have a RO filesystem and we don't need to start
709	* it.
710	*/
711	int
712	xfs_log_mount_finish(
713	struct xfs_mount *mp)
714	{
715	struct xlog *log = mp->m_log;
716	int error = `0`;
717
718	if (xfs_has_norecovery(mp)) {
719	ASSERT(xfs_is_readonly(mp));
720	return `0`;
721	}
722
723	/*
724	* During the second phase of log recovery, we need iget and
725	* iput to behave like they do for an active filesystem.
726	* xfs_fs_drop_inode needs to be able to prevent the deletion
727	* of inodes before we're done replaying log items on those
728	* inodes. Turn it off immediately after recovery finishes
729	* so that we don't leak the quota inodes if subsequent mount
730	* activities fail.
731	*
732	* We let all inodes involved in redo item processing end up on
733	* the LRU instead of being evicted immediately so that if we do
734	* something to an unlinked inode, the irele won't cause
735	* premature truncation and freeing of the inode, which results
736	* in log recovery failure. We have to evict the unreferenced
737	* lru inodes after clearing SB_ACTIVE because we don't
738	* otherwise clean up the lru if there's a subsequent failure in
739	* xfs_mountfs, which leads to us leaking the inodes if nothing
740	* else (e.g. quotacheck) references the inodes before the
741	* mount failure occurs.
742	*/
743	mp->m_super->s_flags \|= SB_ACTIVE;
744	xfs_log_work_queue(mp);
745	if (xlog_recovery_needed(log))
746	error = xlog_recover_finish(log);
747	mp->m_super->s_flags &= ~SB_ACTIVE;
748	evict_inodes(sb: mp->m_super);
749
750	/*
751	* Drain the buffer LRU after log recovery. This is required for v4
752	* filesystems to avoid leaving around buffers with NULL verifier ops,
753	* but we do it unconditionally to make sure we're always in a clean
754	* cache state after mount.
755	*
756	* Don't push in the error case because the AIL may have pending intents
757	* that aren't removed until recovery is cancelled.
758	*/
759	if (xlog_recovery_needed(log)) {
760	if (!error) {
761	xfs_log_force(mp, XFS_LOG_SYNC);
762	xfs_ail_push_all_sync(ailp: mp->m_ail);
763	}
764	xfs_notice(mp, "Ending recovery (logdev: %s)",
765	mp->m_logname ? mp->m_logname : "internal");
766	} else {
767	xfs_info(mp, "Ending clean mount");
768	}
769	xfs_buftarg_drain(mp->m_ddev_targp);
770
771	clear_bit(XLOG_RECOVERY_NEEDED, addr: &log->l_opstate);
772
773	/ Make sure the log is dead if we're returning failure. /
774	ASSERT(!error \|\| xlog_is_shutdown(log));
775
776	return error;
777	}
778
779	/*
780	* The mount has failed. Cancel the recovery if it hasn't completed and destroy
781	* the log.
782	*/
783	void
784	xfs_log_mount_cancel(
785	struct xfs_mount *mp)
786	{
787	xlog_recover_cancel(mp->m_log);
788	xfs_log_unmount(mp);
789	}
790
791	/*
792	* Flush out the iclog to disk ensuring that device caches are flushed and
793	* the iclog hits stable storage before any completion waiters are woken.
794	*/
795	static inline int
796	xlog_force_iclog(
797	struct xlog_in_core *iclog)
798	{
799	atomic_inc(v: &iclog->ic_refcnt);
800	iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
801	if (iclog->ic_state == XLOG_STATE_ACTIVE)
802	xlog_state_switch_iclogs(log: iclog->ic_log, iclog, eventual_size: `0`);
803	return xlog_state_release_iclog(log: iclog->ic_log, iclog, NULL);
804	}
805
806	/*
807	* Cycle all the iclogbuf locks to make sure all log IO completion
808	* is done before we tear down these buffers.
809	*/
810	static void
811	xlog_wait_iclog_completion(struct xlog *log)
812	{
813	int i;
814	struct xlog_in_core *iclog = log->l_iclog;
815
816	for (i = `0`; i < log->l_iclog_bufs; i++) {
817	down(sem: &iclog->ic_sema);
818	up(sem: &iclog->ic_sema);
819	iclog = iclog->ic_next;
820	}
821	}
822
823	/*
824	* Wait for the iclog and all prior iclogs to be written disk as required by the
825	* log force state machine. Waiting on ic_force_wait ensures iclog completions
826	* have been ordered and callbacks run before we are woken here, hence
827	* guaranteeing that all the iclogs up to this one are on stable storage.
828	*/
829	int
830	xlog_wait_on_iclog(
831	struct xlog_in_core *iclog)
832	__releases(iclog->ic_log->l_icloglock)
833	{
834	struct xlog *log = iclog->ic_log;
835
836	trace_xlog_iclog_wait_on(iclog, _RET_IP_);
837	if (!xlog_is_shutdown(log) &&
838	iclog->ic_state != XLOG_STATE_ACTIVE &&
839	iclog->ic_state != XLOG_STATE_DIRTY) {
840	XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
841	xlog_wait(wq: &iclog->ic_force_wait, lock: &log->l_icloglock);
842	} else {
843	spin_unlock(lock: &log->l_icloglock);
844	}
845
846	if (xlog_is_shutdown(log))
847	return -EIO;
848	return `0`;
849	}
850
851	/*
852	* Write out an unmount record using the ticket provided. We have to account for
853	* the data space used in the unmount ticket as this write is not done from a
854	* transaction context that has already done the accounting for us.
855	*/
856	static int
857	xlog_write_unmount_record(
858	struct xlog *log,
859	struct xlog_ticket *ticket)
860	{
861	struct {
862	struct xlog_op_header ophdr;
863	struct xfs_unmount_log_format ulf;
864	} unmount_rec = {
865	.ophdr = {
866	.oh_clientid = XFS_LOG,
867	.oh_tid = cpu_to_be32(ticket->t_tid),
868	.oh_flags = XLOG_UNMOUNT_TRANS,
869	},
870	.ulf = {
871	.magic = XLOG_UNMOUNT_TYPE,
872	},
873	};
874	struct xfs_log_iovec reg = {
875	.i_addr = &unmount_rec,
876	.i_len = sizeof(unmount_rec),
877	.i_type = XLOG_REG_TYPE_UNMOUNT,
878	};
879	struct xfs_log_vec vec = {
880	.lv_niovecs = `1`,
881	.lv_iovecp = &reg,
882	};
883	LIST_HEAD(lv_chain);
884	list_add(new: &vec.lv_list, head: &lv_chain);
885
886	BUILD_BUG_ON((sizeof(struct xlog_op_header) +
887	sizeof(struct xfs_unmount_log_format)) !=
888	sizeof(unmount_rec));
889
890	/ account for space used by record data /
891	ticket->t_curr_res -= sizeof(unmount_rec);
892
893	return xlog_write(log, NULL, lv_chain: &lv_chain, tic: ticket, len: reg.i_len);
894	}
895
896	/*
897	* Mark the filesystem clean by writing an unmount record to the head of the
898	* log.
899	*/
900	static void
901	xlog_unmount_write(
902	struct xlog *log)
903	{
904	struct xfs_mount *mp = log->l_mp;
905	struct xlog_in_core *iclog;
906	struct xlog_ticket *tic = NULL;
907	int error;
908
909	error = xfs_log_reserve(mp, unit_bytes: `600`, cnt: `1`, ticp: &tic, permanent: `0`);
910	if (error)
911	goto out_err;
912
913	error = xlog_write_unmount_record(log, ticket: tic);
914	/*
915	* At this point, we're umounting anyway, so there's no point in
916	* transitioning log state to shutdown. Just continue...
917	*/
918	out_err:
919	if (error)
920	xfs_alert(mp, "%s: unmount record failed", __func__);
921
922	spin_lock(lock: &log->l_icloglock);
923	iclog = log->l_iclog;
924	error = xlog_force_iclog(iclog);
925	xlog_wait_on_iclog(iclog);
926
927	if (tic) {
928	trace_xfs_log_umount_write(log, tic);
929	xfs_log_ticket_ungrant(log, ticket: tic);
930	}
931	}
932
933	static void
934	xfs_log_unmount_verify_iclog(
935	struct xlog *log)
936	{
937	struct xlog_in_core *iclog = log->l_iclog;
938
939	do {
940	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
941	ASSERT(iclog->ic_offset == `0`);
942	} while ((iclog = iclog->ic_next) != log->l_iclog);
943	}
944
945	/*
946	* Unmount record used to have a string "Unmount filesystem--" in the
947	* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
948	* We just write the magic number now since that particular field isn't
949	* currently architecture converted and "Unmount" is a bit foo.
950	* As far as I know, there weren't any dependencies on the old behaviour.
951	*/
952	static void
953	xfs_log_unmount_write(
954	struct xfs_mount *mp)
955	{
956	struct xlog *log = mp->m_log;
957
958	if (!xfs_log_writable(mp))
959	return;
960
961	xfs_log_force(mp, XFS_LOG_SYNC);
962
963	if (xlog_is_shutdown(log))
964	return;
965
966	/*
967	* If we think the summary counters are bad, avoid writing the unmount
968	* record to force log recovery at next mount, after which the summary
969	* counters will be recalculated. Refer to xlog_check_unmount_rec for
970	* more details.
971	*/
972	if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
973	XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
974	xfs_alert(mp, "%s: will fix summary counters at next mount",
975	__func__);
976	return;
977	}
978
979	xfs_log_unmount_verify_iclog(log);
980	xlog_unmount_write(log);
981	}
982
983	/*
984	* Empty the log for unmount/freeze.
985	*
986	* To do this, we first need to shut down the background log work so it is not
987	* trying to cover the log as we clean up. We then need to unpin all objects in
988	* the log so we can then flush them out. Once they have completed their IO and
989	* run the callbacks removing themselves from the AIL, we can cover the log.
990	*/
991	int
992	xfs_log_quiesce(
993	struct xfs_mount *mp)
994	{
995	/*
996	* Clear log incompat features since we're quiescing the log. Report
997	* failures, though it's not fatal to have a higher log feature
998	* protection level than the log contents actually require.
999	*/
1000	if (xfs_clear_incompat_log_features(mp)) {
1001	int error;
1002
1003	error = xfs_sync_sb(mp, false);
1004	if (error)
1005	xfs_warn(mp,
1006	"Failed to clear log incompat features on quiesce");
1007	}
1008
1009	cancel_delayed_work_sync(dwork: &mp->m_log->l_work);
1010	xfs_log_force(mp, XFS_LOG_SYNC);
1011
1012	/*
1013	* The superblock buffer is uncached and while xfs_ail_push_all_sync()
1014	* will push it, xfs_buftarg_wait() will not wait for it. Further,
1015	* xfs_buf_iowait() cannot be used because it was pushed with the
1016	* XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
1017	* the IO to complete.
1018	*/
1019	xfs_ail_push_all_sync(ailp: mp->m_ail);
1020	xfs_buftarg_wait(mp->m_ddev_targp);
1021	xfs_buf_lock(mp->m_sb_bp);
1022	xfs_buf_unlock(mp->m_sb_bp);
1023
1024	return xfs_log_cover(mp);
1025	}
1026
1027	void
1028	xfs_log_clean(
1029	struct xfs_mount *mp)
1030	{
1031	xfs_log_quiesce(mp);
1032	xfs_log_unmount_write(mp);
1033	}
1034
1035	/*
1036	* Shut down and release the AIL and Log.
1037	*
1038	* During unmount, we need to ensure we flush all the dirty metadata objects
1039	* from the AIL so that the log is empty before we write the unmount record to
1040	* the log. Once this is done, we can tear down the AIL and the log.
1041	*/
1042	void
1043	xfs_log_unmount(
1044	struct xfs_mount *mp)
1045	{
1046	xfs_log_clean(mp);
1047
1048	/*
1049	* If shutdown has come from iclog IO context, the log
1050	* cleaning will have been skipped and so we need to wait
1051	* for the iclog to complete shutdown processing before we
1052	* tear anything down.
1053	*/
1054	xlog_wait_iclog_completion(log: mp->m_log);
1055
1056	xfs_buftarg_drain(mp->m_ddev_targp);
1057
1058	xfs_trans_ail_destroy(mp);
1059
1060	xfs_sysfs_del(kobj: &mp->m_log->l_kobj);
1061
1062	xlog_dealloc_log(log: mp->m_log);
1063	}
1064
1065	void
1066	xfs_log_item_init(
1067	struct xfs_mount *mp,
1068	struct xfs_log_item *item,
1069	int type,
1070	const struct xfs_item_ops *ops)
1071	{
1072	item->li_log = mp->m_log;
1073	item->li_ailp = mp->m_ail;
1074	item->li_type = type;
1075	item->li_ops = ops;
1076	item->li_lv = NULL;
1077
1078	INIT_LIST_HEAD(list: &item->li_ail);
1079	INIT_LIST_HEAD(list: &item->li_cil);
1080	INIT_LIST_HEAD(list: &item->li_bio_list);
1081	INIT_LIST_HEAD(list: &item->li_trans);
1082	}
1083
1084	/*
1085	* Wake up processes waiting for log space after we have moved the log tail.
1086	*/
1087	void
1088	xfs_log_space_wake(
1089	struct xfs_mount *mp)
1090	{
1091	struct xlog *log = mp->m_log;
1092	int free_bytes;
1093
1094	if (xlog_is_shutdown(log))
1095	return;
1096
1097	if (!list_empty_careful(head: &log->l_write_head.waiters)) {
1098	ASSERT(!xlog_in_recovery(log));
1099
1100	spin_lock(lock: &log->l_write_head.lock);
1101	free_bytes = xlog_grant_space_left(log, head: &log->l_write_head);
1102	xlog_grant_head_wake(log, head: &log->l_write_head, free_bytes: &free_bytes);
1103	spin_unlock(lock: &log->l_write_head.lock);
1104	}
1105
1106	if (!list_empty_careful(head: &log->l_reserve_head.waiters)) {
1107	ASSERT(!xlog_in_recovery(log));
1108
1109	spin_lock(lock: &log->l_reserve_head.lock);
1110	free_bytes = xlog_grant_space_left(log, head: &log->l_reserve_head);
1111	xlog_grant_head_wake(log, head: &log->l_reserve_head, free_bytes: &free_bytes);
1112	spin_unlock(lock: &log->l_reserve_head.lock);
1113	}
1114	}
1115
1116	/*
1117	* Determine if we have a transaction that has gone to disk that needs to be
1118	* covered. To begin the transition to the idle state firstly the log needs to
1119	* be idle. That means the CIL, the AIL and the iclogs needs to be empty before
1120	* we start attempting to cover the log.
1121	*
1122	* Only if we are then in a state where covering is needed, the caller is
1123	* informed that dummy transactions are required to move the log into the idle
1124	* state.
1125	*
1126	* If there are any items in the AIl or CIL, then we do not want to attempt to
1127	* cover the log as we may be in a situation where there isn't log space
1128	* available to run a dummy transaction and this can lead to deadlocks when the
1129	* tail of the log is pinned by an item that is modified in the CIL. Hence
1130	* there's no point in running a dummy transaction at this point because we
1131	* can't start trying to idle the log until both the CIL and AIL are empty.
1132	*/
1133	static bool
1134	xfs_log_need_covered(
1135	struct xfs_mount *mp)
1136	{
1137	struct xlog *log = mp->m_log;
1138	bool needed = false;
1139
1140	if (!xlog_cil_empty(log))
1141	return false;
1142
1143	spin_lock(lock: &log->l_icloglock);
1144	switch (log->l_covered_state) {
1145	case XLOG_STATE_COVER_DONE:
1146	case XLOG_STATE_COVER_DONE2:
1147	case XLOG_STATE_COVER_IDLE:
1148	break;
1149	case XLOG_STATE_COVER_NEED:
1150	case XLOG_STATE_COVER_NEED2:
1151	if (xfs_ail_min_lsn(log->l_ailp))
1152	break;
1153	if (!xlog_iclogs_empty(log))
1154	break;
1155
1156	needed = true;
1157	if (log->l_covered_state == XLOG_STATE_COVER_NEED)
1158	log->l_covered_state = XLOG_STATE_COVER_DONE;
1159	else
1160	log->l_covered_state = XLOG_STATE_COVER_DONE2;
1161	break;
1162	default:
1163	needed = true;
1164	break;
1165	}
1166	spin_unlock(lock: &log->l_icloglock);
1167	return needed;
1168	}
1169
1170	/*
1171	* Explicitly cover the log. This is similar to background log covering but
1172	* intended for usage in quiesce codepaths. The caller is responsible to ensure
1173	* the log is idle and suitable for covering. The CIL, iclog buffers and AIL
1174	* must all be empty.
1175	*/
1176	static int
1177	xfs_log_cover(
1178	struct xfs_mount *mp)
1179	{
1180	int error = `0`;
1181	bool need_covered;
1182
1183	ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
1184	!xfs_ail_min_lsn(mp->m_log->l_ailp)) \|\|
1185	xlog_is_shutdown(mp->m_log));
1186
1187	if (!xfs_log_writable(mp))
1188	return `0`;
1189
1190	/*
1191	* xfs_log_need_covered() is not idempotent because it progresses the
1192	* state machine if the log requires covering. Therefore, we must call
1193	* this function once and use the result until we've issued an sb sync.
1194	* Do so first to make that abundantly clear.
1195	*
1196	* Fall into the covering sequence if the log needs covering or the
1197	* mount has lazy superblock accounting to sync to disk. The sb sync
1198	* used for covering accumulates the in-core counters, so covering
1199	* handles this for us.
1200	*/
1201	need_covered = xfs_log_need_covered(mp);
1202	if (!need_covered && !xfs_has_lazysbcount(mp))
1203	return `0`;
1204
1205	/*
1206	* To cover the log, commit the superblock twice (at most) in
1207	* independent checkpoints. The first serves as a reference for the
1208	* tail pointer. The sync transaction and AIL push empties the AIL and
1209	* updates the in-core tail to the LSN of the first checkpoint. The
1210	* second commit updates the on-disk tail with the in-core LSN,
1211	* covering the log. Push the AIL one more time to leave it empty, as
1212	* we found it.
1213	*/
1214	do {
1215	error = xfs_sync_sb(mp, true);
1216	if (error)
1217	break;
1218	xfs_ail_push_all_sync(ailp: mp->m_ail);
1219	} while (xfs_log_need_covered(mp));
1220
1221	return error;
1222	}
1223
1224	static void
1225	xlog_ioend_work(
1226	struct work_struct *work)
1227	{
1228	struct xlog_in_core *iclog =
1229	container_of(work, struct xlog_in_core, ic_end_io_work);
1230	struct xlog *log = iclog->ic_log;
1231	int error;
1232
1233	error = blk_status_to_errno(status: iclog->ic_bio.bi_status);
1234	#ifdef DEBUG
1235	/ treat writes with injected CRC errors as failed /
1236	if (iclog->ic_fail_crc)
1237	error = -EIO;
1238	#endif
1239
1240	/*
1241	* Race to shutdown the filesystem if we see an error.
1242	*/
1243	if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
1244	xfs_alert(log->l_mp, "log I/O error %d", error);
1245	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
1246	}
1247
1248	xlog_state_done_syncing(iclog);
1249	bio_uninit(&iclog->ic_bio);
1250
1251	/*
1252	* Drop the lock to signal that we are done. Nothing references the
1253	* iclog after this, so an unmount waiting on this lock can now tear it
1254	* down safely. As such, it is unsafe to reference the iclog after the
1255	* unlock as we could race with it being freed.
1256	*/
1257	up(sem: &iclog->ic_sema);
1258	}
1259
1260	/*
1261	* Return size of each in-core log record buffer.
1262	*
1263	* All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1264	*
1265	* If the filesystem blocksize is too large, we may need to choose a
1266	* larger size since the directory code currently logs entire blocks.
1267	*/
1268	STATIC void
1269	xlog_get_iclog_buffer_size(
1270	struct xfs_mount *mp,
1271	struct xlog *log)
1272	{
1273	if (mp->m_logbufs <= `0`)
1274	mp->m_logbufs = XLOG_MAX_ICLOGS;
1275	if (mp->m_logbsize <= `0`)
1276	mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
1277
1278	log->l_iclog_bufs = mp->m_logbufs;
1279	log->l_iclog_size = mp->m_logbsize;
1280
1281	/*
1282	* # headers = size / 32k - one header holds cycles from 32k of data.
1283	*/
1284	log->l_iclog_heads =
1285	DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
1286	log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
1287	}
1288
1289	void
1290	xfs_log_work_queue(
1291	struct xfs_mount *mp)
1292	{
1293	queue_delayed_work(wq: mp->m_sync_workqueue, dwork: &mp->m_log->l_work,
1294	delay: msecs_to_jiffies(xfs_syncd_centisecs * `10`));
1295	}
1296
1297	/*
1298	* Clear the log incompat flags if we have the opportunity.
1299	*
1300	* This only happens if we're about to log the second dummy transaction as part
1301	* of covering the log.
1302	*/
1303	static inline void
1304	xlog_clear_incompat(
1305	struct xlog *log)
1306	{
1307	struct xfs_mount *mp = log->l_mp;
1308
1309	if (!xfs_sb_has_incompat_log_feature(&mp->m_sb,
1310	XFS_SB_FEAT_INCOMPAT_LOG_ALL))
1311	return;
1312
1313	if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
1314	return;
1315
1316	xfs_clear_incompat_log_features(mp);
1317	}
1318
1319	/*
1320	* Every sync period we need to unpin all items in the AIL and push them to
1321	* disk. If there is nothing dirty, then we might need to cover the log to
1322	* indicate that the filesystem is idle.
1323	*/
1324	static void
1325	xfs_log_worker(
1326	struct work_struct *work)
1327	{
1328	struct xlog *log = container_of(to_delayed_work(work),
1329	struct xlog, l_work);
1330	struct xfs_mount *mp = log->l_mp;
1331
1332	/ dgc: errors ignored - not fatal and nowhere to report them /
1333	if (xfs_fs_writable(mp, level: SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) {
1334	/*
1335	* Dump a transaction into the log that contains no real change.
1336	* This is needed to stamp the current tail LSN into the log
1337	* during the covering operation.
1338	*
1339	* We cannot use an inode here for this - that will push dirty
1340	* state back up into the VFS and then periodic inode flushing
1341	* will prevent log covering from making progress. Hence we
1342	* synchronously log the superblock instead to ensure the
1343	* superblock is immediately unpinned and can be written back.
1344	*/
1345	xlog_clear_incompat(log);
1346	xfs_sync_sb(mp, true);
1347	} else
1348	xfs_log_force(mp, flags: `0`);
1349
1350	/ start pushing all the metadata that is currently dirty /
1351	xfs_ail_push_all(ailp: mp->m_ail);
1352
1353	/ queue us up again /
1354	xfs_log_work_queue(mp);
1355	}
1356
1357	/*
1358	* This routine initializes some of the log structure for a given mount point.
1359	* Its primary purpose is to fill in enough, so recovery can occur. However,
1360	* some other stuff may be filled in too.
1361	*/
1362	STATIC struct xlog *
1363	xlog_alloc_log(
1364	struct xfs_mount *mp,
1365	struct xfs_buftarg *log_target,
1366	xfs_daddr_t blk_offset,
1367	int num_bblks)
1368	{
1369	struct xlog *log;
1370	xlog_rec_header_t *head;
1371	xlog_in_core_t **iclogp;
1372	xlog_in_core_t iclog, prev_iclog=NULL;
1373	int i;
1374	int error = -ENOMEM;
1375	uint log2_size = `0`;
1376
1377	log = kzalloc(sizeof(struct xlog), GFP_KERNEL \| __GFP_RETRY_MAYFAIL);
1378	if (!log) {
1379	xfs_warn(mp, "Log allocation failed: No memory!");
1380	goto out;
1381	}
1382
1383	log->l_mp = mp;
1384	log->l_targ = log_target;
1385	log->l_logsize = BBTOB(num_bblks);
1386	log->l_logBBstart = blk_offset;
1387	log->l_logBBsize = num_bblks;
1388	log->l_covered_state = XLOG_STATE_COVER_IDLE;
1389	set_bit(XLOG_ACTIVE_RECOVERY, addr: &log->l_opstate);
1390	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1391	INIT_LIST_HEAD(list: &log->r_dfops);
1392
1393	log->l_prev_block = -`1`;
1394	/ log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 /
1395	xlog_assign_atomic_lsn(lsn: &log->l_tail_lsn, cycle: `1`, block: `0`);
1396	log->l_curr_cycle = `1`; / 0 is bad since this is initial value /
1397
1398	if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > `1`)
1399	log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
1400	else
1401	log->l_iclog_roundoff = BBSIZE;
1402
1403	xlog_grant_head_init(head: &log->l_reserve_head);
1404	xlog_grant_head_init(head: &log->l_write_head);
1405
1406	error = -EFSCORRUPTED;
1407	if (xfs_has_sector(mp)) {
1408	log2_size = mp->m_sb.sb_logsectlog;
1409	if (log2_size < BBSHIFT) {
1410	xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1411	log2_size, BBSHIFT);
1412	goto out_free_log;
1413	}
1414
1415	log2_size -= BBSHIFT;
1416	if (log2_size > mp->m_sectbb_log) {
1417	xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1418	log2_size, mp->m_sectbb_log);
1419	goto out_free_log;
1420	}
1421
1422	/ for larger sector sizes, must have v2 or external log /
1423	if (log2_size && log->l_logBBstart > `0` &&
1424	!xfs_has_logv2(mp)) {
1425	xfs_warn(mp,
1426	"log sector size (0x%x) invalid for configuration.",
1427	log2_size);
1428	goto out_free_log;
1429	}
1430	}
1431	log->l_sectBBsize = `1` << log2_size;
1432
1433	xlog_get_iclog_buffer_size(mp, log);
1434
1435	spin_lock_init(&log->l_icloglock);
1436	init_waitqueue_head(&log->l_flush_wait);
1437
1438	iclogp = &log->l_iclog;
1439	/*
1440	* The amount of memory to allocate for the iclog structure is
1441	* rather funky due to the way the structure is defined. It is
1442	* done this way so that we can use different sizes for machines
1443	* with different amounts of memory. See the definition of
1444	* xlog_in_core_t in xfs_log_priv.h for details.
1445	*/
1446	ASSERT(log->l_iclog_size >= `4096`);
1447	for (i = `0`; i < log->l_iclog_bufs; i++) {
1448	size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
1449	sizeof(struct bio_vec);
1450
1451	iclog = kzalloc(sizeof(*iclog) + bvec_size,
1452	GFP_KERNEL \| __GFP_RETRY_MAYFAIL);
1453	if (!iclog)
1454	goto out_free_iclog;
1455
1456	*iclogp = iclog;
1457	iclog->ic_prev = prev_iclog;
1458	prev_iclog = iclog;
1459
1460	iclog->ic_data = kvzalloc(log->l_iclog_size,
1461	GFP_KERNEL \| __GFP_RETRY_MAYFAIL);
1462	if (!iclog->ic_data)
1463	goto out_free_iclog;
1464	head = &iclog->ic_header;
1465	memset(head, `0`, sizeof(xlog_rec_header_t));
1466	head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1467	head->h_version = cpu_to_be32(
1468	xfs_has_logv2(log->l_mp) ? `2` : `1`);
1469	head->h_size = cpu_to_be32(log->l_iclog_size);
1470	/ new fields /
1471	head->h_fmt = cpu_to_be32(XLOG_FMT);
1472	memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1473
1474	iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
1475	iclog->ic_state = XLOG_STATE_ACTIVE;
1476	iclog->ic_log = log;
1477	atomic_set(v: &iclog->ic_refcnt, i: `0`);
1478	INIT_LIST_HEAD(list: &iclog->ic_callbacks);
1479	iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
1480
1481	init_waitqueue_head(&iclog->ic_force_wait);
1482	init_waitqueue_head(&iclog->ic_write_wait);
1483	INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
1484	sema_init(sem: &iclog->ic_sema, val: `1`);
1485
1486	iclogp = &iclog->ic_next;
1487	}
1488	iclogp = log->l_iclog; /* complete ring /
1489	log->l_iclog->ic_prev = prev_iclog; / re-write 1st prev ptr /
1490
1491	log->l_ioend_workqueue = alloc_workqueue(fmt: "xfs-log/%s",
1492	XFS_WQFLAGS(WQ_FREEZABLE \| WQ_MEM_RECLAIM \|
1493	WQ_HIGHPRI),
1494	max_active: `0`, mp->m_super->s_id);
1495	if (!log->l_ioend_workqueue)
1496	goto out_free_iclog;
1497
1498	error = xlog_cil_init(log);
1499	if (error)
1500	goto out_destroy_workqueue;
1501	return log;
1502
1503	out_destroy_workqueue:
1504	destroy_workqueue(wq: log->l_ioend_workqueue);
1505	out_free_iclog:
1506	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1507	prev_iclog = iclog->ic_next;
1508	kvfree(addr: iclog->ic_data);
1509	kfree(objp: iclog);
1510	if (prev_iclog == log->l_iclog)
1511	break;
1512	}
1513	out_free_log:
1514	kfree(objp: log);
1515	out:
1516	return ERR_PTR(error);
1517	} / xlog_alloc_log /
1518
1519	/*
1520	* Stamp cycle number in every block
1521	*/
1522	STATIC void
1523	xlog_pack_data(
1524	struct xlog *log,
1525	struct xlog_in_core *iclog,
1526	int roundoff)
1527	{
1528	int i, j, k;
1529	int size = iclog->ic_offset + roundoff;
1530	__be32 cycle_lsn;
1531	char *dp;
1532
1533	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1534
1535	dp = iclog->ic_datap;
1536	for (i = `0`; i < BTOBB(size); i++) {
1537	if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1538	break;
1539	iclog->ic_header.h_cycle_data[i] = (__be32 )dp;
1540	(__be32 )dp = cycle_lsn;
1541	dp += BBSIZE;
1542	}
1543
1544	if (xfs_has_logv2(mp: log->l_mp)) {
1545	xlog_in_core_2_t *xhdr = iclog->ic_data;
1546
1547	for ( ; i < BTOBB(size); i++) {
1548	j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1549	k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1550	xhdr[j].hic_xheader.xh_cycle_data[k] = (__be32 )dp;
1551	(__be32 )dp = cycle_lsn;
1552	dp += BBSIZE;
1553	}
1554
1555	for (i = `1`; i < log->l_iclog_heads; i++)
1556	xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1557	}
1558	}
1559
1560	/*
1561	* Calculate the checksum for a log buffer.
1562	*
1563	* This is a little more complicated than it should be because the various
1564	* headers and the actual data are non-contiguous.
1565	*/
1566	__le32
1567	xlog_cksum(
1568	struct xlog *log,
1569	struct xlog_rec_header *rhead,
1570	char *dp,
1571	int size)
1572	{
1573	uint32_t crc;
1574
1575	/ first generate the crc for the record header ... /
1576	crc = xfs_start_cksum_update((char *)rhead,
1577	sizeof(struct xlog_rec_header),
1578	offsetof(struct xlog_rec_header, h_crc));
1579
1580	/ ... then for additional cycle data for v2 logs ... /
1581	if (xfs_has_logv2(mp: log->l_mp)) {
1582	union xlog_in_core2 xhdr = (union* xlog_in_core2 *)rhead;
1583	int i;
1584	int xheads;
1585
1586	xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
1587
1588	for (i = `1`; i < xheads; i++) {
1589	crc = crc32c(crc, &xhdr[i].hic_xheader,
1590	sizeof(struct xlog_rec_ext_header));
1591	}
1592	}
1593
1594	/ ... and finally for the payload /
1595	crc = crc32c(crc, p: dp, len: size);
1596
1597	return xfs_end_cksum(crc);
1598	}
1599
1600	static void
1601	xlog_bio_end_io(
1602	struct bio *bio)
1603	{
1604	struct xlog_in_core *iclog = bio->bi_private;
1605
1606	queue_work(wq: iclog->ic_log->l_ioend_workqueue,
1607	work: &iclog->ic_end_io_work);
1608	}
1609
1610	STATIC void
1611	xlog_write_iclog(
1612	struct xlog *log,
1613	struct xlog_in_core *iclog,
1614	uint64_t bno,
1615	unsigned int count)
1616	{
1617	ASSERT(bno < log->l_logBBsize);
1618	trace_xlog_iclog_write(iclog, _RET_IP_);
1619
1620	/*
1621	* We lock the iclogbufs here so that we can serialise against I/O
1622	* completion during unmount. We might be processing a shutdown
1623	* triggered during unmount, and that can occur asynchronously to the
1624	* unmount thread, and hence we need to ensure that completes before
1625	* tearing down the iclogbufs. Hence we need to hold the buffer lock
1626	* across the log IO to archieve that.
1627	*/
1628	down(sem: &iclog->ic_sema);
1629	if (xlog_is_shutdown(log)) {
1630	/*
1631	* It would seem logical to return EIO here, but we rely on
1632	* the log state machine to propagate I/O errors instead of
1633	* doing it here. We kick of the state machine and unlock
1634	* the buffer manually, the code needs to be kept in sync
1635	* with the I/O completion path.
1636	*/
1637	goto sync;
1638	}
1639
1640	/*
1641	* We use REQ_SYNC \| REQ_IDLE here to tell the block layer the are more
1642	* IOs coming immediately after this one. This prevents the block layer
1643	* writeback throttle from throttling log writes behind background
1644	* metadata writeback and causing priority inversions.
1645	*/
1646	bio_init(bio: &iclog->ic_bio, bdev: log->l_targ->bt_bdev, table: iclog->ic_bvec,
1647	howmany(count, PAGE_SIZE),
1648	opf: REQ_OP_WRITE \| REQ_META \| REQ_SYNC \| REQ_IDLE);
1649	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
1650	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
1651	iclog->ic_bio.bi_private = iclog;
1652
1653	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
1654	iclog->ic_bio.bi_opf \|= REQ_PREFLUSH;
1655	/*
1656	* For external log devices, we also need to flush the data
1657	* device cache first to ensure all metadata writeback covered
1658	* by the LSN in this iclog is on stable storage. This is slow,
1659	* but it must complete before we issue the external log IO.
1660	*
1661	* If the flush fails, we cannot conclude that past metadata
1662	* writeback from the log succeeded. Repeating the flush is
1663	* not possible, hence we must shut down with log IO error to
1664	* avoid shutdown re-entering this path and erroring out again.
1665	*/
1666	if (log->l_targ != log->l_mp->m_ddev_targp &&
1667	blkdev_issue_flush(bdev: log->l_mp->m_ddev_targp->bt_bdev))
1668	goto shutdown;
1669	}
1670	if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
1671	iclog->ic_bio.bi_opf \|= REQ_FUA;
1672
1673	iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA);
1674
1675	if (is_vmalloc_addr(x: iclog->ic_data)) {
1676	if (!bio_add_vmalloc(bio: &iclog->ic_bio, vaddr: iclog->ic_data, len: count))
1677	goto shutdown;
1678	} else {
1679	bio_add_virt_nofail(bio: &iclog->ic_bio, vaddr: iclog->ic_data, len: count);
1680	}
1681
1682	/*
1683	* If this log buffer would straddle the end of the log we will have
1684	* to split it up into two bios, so that we can continue at the start.
1685	*/
1686	if (bno + BTOBB(count) > log->l_logBBsize) {
1687	struct bio *split;
1688
1689	split = bio_split(bio: &iclog->ic_bio, sectors: log->l_logBBsize - bno,
1690	GFP_NOIO, bs: &fs_bio_set);
1691	bio_chain(split, &iclog->ic_bio);
1692	submit_bio(bio: split);
1693
1694	/ restart at logical offset zero for the remainder /
1695	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
1696	}
1697
1698	submit_bio(bio: &iclog->ic_bio);
1699	return;
1700	shutdown:
1701	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
1702	sync:
1703	xlog_state_done_syncing(iclog);
1704	up(sem: &iclog->ic_sema);
1705	}
1706
1707	/*
1708	* We need to bump cycle number for the part of the iclog that is
1709	* written to the start of the log. Watch out for the header magic
1710	* number case, though.
1711	*/
1712	static void
1713	xlog_split_iclog(
1714	struct xlog *log,
1715	void *data,
1716	uint64_t bno,
1717	unsigned int count)
1718	{
1719	unsigned int split_offset = BBTOB(log->l_logBBsize - bno);
1720	unsigned int i;
1721
1722	for (i = split_offset; i < count; i += BBSIZE) {
1723	uint32_t cycle = get_unaligned_be32(p: data + i);
1724
1725	if (++cycle == XLOG_HEADER_MAGIC_NUM)
1726	cycle++;
1727	put_unaligned_be32(val: cycle, p: data + i);
1728	}
1729	}
1730
1731	static int
1732	xlog_calc_iclog_size(
1733	struct xlog *log,
1734	struct xlog_in_core *iclog,
1735	uint32_t *roundoff)
1736	{
1737	uint32_t count_init, count;
1738
1739	/ Add for LR header /
1740	count_init = log->l_iclog_hsize + iclog->ic_offset;
1741	count = roundup(count_init, log->l_iclog_roundoff);
1742
1743	*roundoff = count - count_init;
1744
1745	ASSERT(count >= count_init);
1746	ASSERT(*roundoff < log->l_iclog_roundoff);
1747	return count;
1748	}
1749
1750	/*
1751	* Flush out the in-core log (iclog) to the on-disk log in an asynchronous
1752	* fashion. Previously, we should have moved the current iclog
1753	* ptr in the log to point to the next available iclog. This allows further
1754	* write to continue while this code syncs out an iclog ready to go.
1755	* Before an in-core log can be written out, the data section must be scanned
1756	* to save away the 1st word of each BBSIZE block into the header. We replace
1757	* it with the current cycle count. Each BBSIZE block is tagged with the
1758	* cycle count because there in an implicit assumption that drives will
1759	* guarantee that entire 512 byte blocks get written at once. In other words,
1760	* we can't have part of a 512 byte block written and part not written. By
1761	* tagging each block, we will know which blocks are valid when recovering
1762	* after an unclean shutdown.
1763	*
1764	* This routine is single threaded on the iclog. No other thread can be in
1765	* this routine with the same iclog. Changing contents of iclog can there-
1766	* fore be done without grabbing the state machine lock. Updating the global
1767	* log will require grabbing the lock though.
1768	*
1769	* The entire log manager uses a logical block numbering scheme. Only
1770	* xlog_write_iclog knows about the fact that the log may not start with
1771	* block zero on a given device.
1772	*/
1773	STATIC void
1774	xlog_sync(
1775	struct xlog *log,
1776	struct xlog_in_core *iclog,
1777	struct xlog_ticket *ticket)
1778	{
1779	unsigned int count; / byte count of bwrite /
1780	unsigned int roundoff; / roundoff to BB or stripe /
1781	uint64_t bno;
1782	unsigned int size;
1783
1784	ASSERT(atomic_read(&iclog->ic_refcnt) == `0`);
1785	trace_xlog_iclog_sync(iclog, _RET_IP_);
1786
1787	count = xlog_calc_iclog_size(log, iclog, roundoff: &roundoff);
1788
1789	/*
1790	* If we have a ticket, account for the roundoff via the ticket
1791	* reservation to avoid touching the hot grant heads needlessly.
1792	* Otherwise, we have to move grant heads directly.
1793	*/
1794	if (ticket) {
1795	ticket->t_curr_res -= roundoff;
1796	} else {
1797	xlog_grant_add_space(head: &log->l_reserve_head, bytes: roundoff);
1798	xlog_grant_add_space(head: &log->l_write_head, bytes: roundoff);
1799	}
1800
1801	/ put cycle number in every block /
1802	xlog_pack_data(log, iclog, roundoff);
1803
1804	/ real byte length /
1805	size = iclog->ic_offset;
1806	if (xfs_has_logv2(mp: log->l_mp))
1807	size += roundoff;
1808	iclog->ic_header.h_len = cpu_to_be32(size);
1809
1810	XFS_STATS_INC(log->l_mp, xs_log_writes);
1811	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
1812
1813	bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
1814
1815	/ Do we need to split this write into 2 parts? /
1816	if (bno + BTOBB(count) > log->l_logBBsize)
1817	xlog_split_iclog(log, data: &iclog->ic_header, bno, count);
1818
1819	/ calculcate the checksum /
1820	iclog->ic_header.h_crc = xlog_cksum(log, rhead: &iclog->ic_header,
1821	dp: iclog->ic_datap, size);
1822	/*
1823	* Intentionally corrupt the log record CRC based on the error injection
1824	* frequency, if defined. This facilitates testing log recovery in the
1825	* event of torn writes. Hence, set the IOABORT state to abort the log
1826	* write on I/O completion and shutdown the fs. The subsequent mount
1827	* detects the bad CRC and attempts to recover.
1828	*/
1829	#ifdef DEBUG
1830	if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
1831	iclog->ic_header.h_crc &= cpu_to_le32(`0xAAAAAAAA`);
1832	iclog->ic_fail_crc = true;
1833	xfs_warn(log->l_mp,
1834	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
1835	be64_to_cpu(iclog->ic_header.h_lsn));
1836	}
1837	#endif
1838	xlog_verify_iclog(log, iclog, count);
1839	xlog_write_iclog(log, iclog, bno, count);
1840	}
1841
1842	/*
1843	* Deallocate a log structure
1844	*/
1845	STATIC void
1846	xlog_dealloc_log(
1847	struct xlog *log)
1848	{
1849	xlog_in_core_t iclog, next_iclog;
1850	int i;
1851
1852	/*
1853	* Destroy the CIL after waiting for iclog IO completion because an
1854	* iclog EIO error will try to shut down the log, which accesses the
1855	* CIL to wake up the waiters.
1856	*/
1857	xlog_cil_destroy(log);
1858
1859	iclog = log->l_iclog;
1860	for (i = `0`; i < log->l_iclog_bufs; i++) {
1861	next_iclog = iclog->ic_next;
1862	kvfree(addr: iclog->ic_data);
1863	kfree(objp: iclog);
1864	iclog = next_iclog;
1865	}
1866
1867	log->l_mp->m_log = NULL;
1868	destroy_workqueue(wq: log->l_ioend_workqueue);
1869	kfree(objp: log);
1870	}
1871
1872	/*
1873	* Update counters atomically now that memcpy is done.
1874	*/
1875	static inline void
1876	xlog_state_finish_copy(
1877	struct xlog *log,
1878	struct xlog_in_core *iclog,
1879	int record_cnt,
1880	int copy_bytes)
1881	{
1882	lockdep_assert_held(&log->l_icloglock);
1883
1884	be32_add_cpu(var: &iclog->ic_header.h_num_logops, val: record_cnt);
1885	iclog->ic_offset += copy_bytes;
1886	}
1887
1888	/*
1889	* print out info relating to regions written which consume
1890	* the reservation
1891	*/
1892	void
1893	xlog_print_tic_res(
1894	struct xfs_mount *mp,
1895	struct xlog_ticket *ticket)
1896	{
1897	xfs_warn(mp, "ticket reservation summary:");
1898	xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
1899	xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
1900	xfs_warn(mp, " original count = %d", ticket->t_ocnt);
1901	xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
1902	}
1903
1904	/*
1905	* Print a summary of the transaction.
1906	*/
1907	void
1908	xlog_print_trans(
1909	struct xfs_trans *tp)
1910	{
1911	struct xfs_mount *mp = tp->t_mountp;
1912	struct xfs_log_item *lip;
1913
1914	/ dump core transaction and ticket info /
1915	xfs_warn(mp, "transaction summary:");
1916	xfs_warn(mp, " log res = %d", tp->t_log_res);
1917	xfs_warn(mp, " log count = %d", tp->t_log_count);
1918	xfs_warn(mp, " flags = 0x%x", tp->t_flags);
1919
1920	xlog_print_tic_res(mp, ticket: tp->t_ticket);
1921
1922	/ dump each log item /
1923	list_for_each_entry(lip, &tp->t_items, li_trans) {
1924	struct xfs_log_vec *lv = lip->li_lv;
1925	struct xfs_log_iovec *vec;
1926	int i;
1927
1928	xfs_warn(mp, "log item: ");
1929	xfs_warn(mp, " type = 0x%x", lip->li_type);
1930	xfs_warn(mp, " flags = 0x%lx", lip->li_flags);
1931	if (!lv)
1932	continue;
1933	xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
1934	xfs_warn(mp, " size = %d", lv->lv_size);
1935	xfs_warn(mp, " bytes = %d", lv->lv_bytes);
1936	xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
1937
1938	/ dump each iovec for the log item /
1939	vec = lv->lv_iovecp;
1940	for (i = `0`; i < lv->lv_niovecs; i++) {
1941	int dumplen = min(vec->i_len, `32`);
1942
1943	xfs_warn(mp, " iovec[%d]", i);
1944	xfs_warn(mp, " type = 0x%x", vec->i_type);
1945	xfs_warn(mp, " len = %d", vec->i_len);
1946	xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i);
1947	xfs_hex_dump(p: vec->i_addr, length: dumplen);
1948
1949	vec++;
1950	}
1951	}
1952	}
1953
1954	static inline void
1955	xlog_write_iovec(
1956	struct xlog_in_core *iclog,
1957	uint32_t *log_offset,
1958	void *data,
1959	uint32_t write_len,
1960	int *bytes_left,
1961	uint32_t *record_cnt,
1962	uint32_t *data_cnt)
1963	{
1964	ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
1965	ASSERT(log_offset % sizeof*(int32_t) == `0`);
1966	ASSERT(write_len % sizeof(int32_t) == `0`);
1967
1968	memcpy(iclog->ic_datap + *log_offset, data, write_len);
1969	*log_offset += write_len;
1970	*bytes_left -= write_len;
1971	(*record_cnt)++;
1972	*data_cnt += write_len;
1973	}
1974
1975	/*
1976	* Write log vectors into a single iclog which is guaranteed by the caller
1977	* to have enough space to write the entire log vector into.
1978	*/
1979	static void
1980	xlog_write_full(
1981	struct xfs_log_vec *lv,
1982	struct xlog_ticket *ticket,
1983	struct xlog_in_core *iclog,
1984	uint32_t *log_offset,
1985	uint32_t *len,
1986	uint32_t *record_cnt,
1987	uint32_t *data_cnt)
1988	{
1989	int index;
1990
1991	ASSERT(log_offset + len <= iclog->ic_size \|\|
1992	iclog->ic_state == XLOG_STATE_WANT_SYNC);
1993
1994	/*
1995	* Ordered log vectors have no regions to write so this
1996	* loop will naturally skip them.
1997	*/
1998	for (index = `0`; index < lv->lv_niovecs; index++) {
1999	struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
2000	struct xlog_op_header *ophdr = reg->i_addr;
2001
2002	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2003	xlog_write_iovec(iclog, log_offset, data: reg->i_addr,
2004	write_len: reg->i_len, bytes_left: len, record_cnt, data_cnt);
2005	}
2006	}
2007
2008	static int
2009	xlog_write_get_more_iclog_space(
2010	struct xlog_ticket *ticket,
2011	struct xlog_in_core **iclogp,
2012	uint32_t *log_offset,
2013	uint32_t len,
2014	uint32_t *record_cnt,
2015	uint32_t *data_cnt)
2016	{
2017	struct xlog_in_core iclog = iclogp;
2018	struct xlog *log = iclog->ic_log;
2019	int error;
2020
2021	spin_lock(lock: &log->l_icloglock);
2022	ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
2023	xlog_state_finish_copy(log, iclog, record_cnt: record_cnt, copy_bytes: data_cnt);
2024	error = xlog_state_release_iclog(log, iclog, ticket);
2025	spin_unlock(lock: &log->l_icloglock);
2026	if (error)
2027	return error;
2028
2029	error = xlog_state_get_iclog_space(log, len, iclog: &iclog, ticket,
2030	logoffsetp: log_offset);
2031	if (error)
2032	return error;
2033	*record_cnt = `0`;
2034	*data_cnt = `0`;
2035	*iclogp = iclog;
2036	return `0`;
2037	}
2038
2039	/*
2040	* Write log vectors into a single iclog which is smaller than the current chain
2041	* length. We write until we cannot fit a full record into the remaining space
2042	* and then stop. We return the log vector that is to be written that cannot
2043	* wholly fit in the iclog.
2044	*/
2045	static int
2046	xlog_write_partial(
2047	struct xfs_log_vec *lv,
2048	struct xlog_ticket *ticket,
2049	struct xlog_in_core **iclogp,
2050	uint32_t *log_offset,
2051	uint32_t *len,
2052	uint32_t *record_cnt,
2053	uint32_t *data_cnt)
2054	{
2055	struct xlog_in_core iclog = iclogp;
2056	struct xlog_op_header *ophdr;
2057	int index = `0`;
2058	uint32_t rlen;
2059	int error;
2060
2061	/ walk the logvec, copying until we run out of space in the iclog /
2062	for (index = `0`; index < lv->lv_niovecs; index++) {
2063	struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
2064	uint32_t reg_offset = `0`;
2065
2066	/*
2067	* The first region of a continuation must have a non-zero
2068	* length otherwise log recovery will just skip over it and
2069	* start recovering from the next opheader it finds. Because we
2070	* mark the next opheader as a continuation, recovery will then
2071	* incorrectly add the continuation to the previous region and
2072	* that breaks stuff.
2073	*
2074	* Hence if there isn't space for region data after the
2075	* opheader, then we need to start afresh with a new iclog.
2076	*/
2077	if (iclog->ic_size - *log_offset <=
2078	sizeof(struct xlog_op_header)) {
2079	error = xlog_write_get_more_iclog_space(ticket,
2080	iclogp: &iclog, log_offset, len: *len, record_cnt,
2081	data_cnt);
2082	if (error)
2083	return error;
2084	}
2085
2086	ophdr = reg->i_addr;
2087	rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
2088
2089	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2090	ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
2091	if (rlen != reg->i_len)
2092	ophdr->oh_flags \|= XLOG_CONTINUE_TRANS;
2093
2094	xlog_write_iovec(iclog, log_offset, data: reg->i_addr,
2095	write_len: rlen, bytes_left: len, record_cnt, data_cnt);
2096
2097	/ If we wrote the whole region, move to the next. /
2098	if (rlen == reg->i_len)
2099	continue;
2100
2101	/*
2102	* We now have a partially written iovec, but it can span
2103	* multiple iclogs so we loop here. First we release the iclog
2104	* we currently have, then we get a new iclog and add a new
2105	* opheader. Then we continue copying from where we were until
2106	* we either complete the iovec or fill the iclog. If we
2107	* complete the iovec, then we increment the index and go right
2108	* back to the top of the outer loop. if we fill the iclog, we
2109	* run the inner loop again.
2110	*
2111	* This is complicated by the tail of a region using all the
2112	* space in an iclog and hence requiring us to release the iclog
2113	* and get a new one before returning to the outer loop. We must
2114	* always guarantee that we exit this inner loop with at least
2115	* space for log transaction opheaders left in the current
2116	* iclog, hence we cannot just terminate the loop at the end
2117	* of the of the continuation. So we loop while there is no
2118	* space left in the current iclog, and check for the end of the
2119	* continuation after getting a new iclog.
2120	*/
2121	do {
2122	/*
2123	* Ensure we include the continuation opheader in the
2124	* space we need in the new iclog by adding that size
2125	* to the length we require. This continuation opheader
2126	* needs to be accounted to the ticket as the space it
2127	* consumes hasn't been accounted to the lv we are
2128	* writing.
2129	*/
2130	error = xlog_write_get_more_iclog_space(ticket,
2131	&iclog, log_offset,
2132	len + sizeof(struct* xlog_op_header),
2133	record_cnt, data_cnt);
2134	if (error)
2135	return error;
2136
2137	ophdr = iclog->ic_datap + *log_offset;
2138	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2139	ophdr->oh_clientid = XFS_TRANSACTION;
2140	ophdr->oh_res2 = `0`;
2141	ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
2142
2143	ticket->t_curr_res -= sizeof(struct xlog_op_header);
2144	log_offset += sizeof(struct* xlog_op_header);
2145	data_cnt += sizeof(struct* xlog_op_header);
2146
2147	/*
2148	* If rlen fits in the iclog, then end the region
2149	* continuation. Otherwise we're going around again.
2150	*/
2151	reg_offset += rlen;
2152	rlen = reg->i_len - reg_offset;
2153	if (rlen <= iclog->ic_size - *log_offset)
2154	ophdr->oh_flags \|= XLOG_END_TRANS;
2155	else
2156	ophdr->oh_flags \|= XLOG_CONTINUE_TRANS;
2157
2158	rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
2159	ophdr->oh_len = cpu_to_be32(rlen);
2160
2161	xlog_write_iovec(iclog, log_offset,
2162	data: reg->i_addr + reg_offset,
2163	write_len: rlen, bytes_left: len, record_cnt, data_cnt);
2164
2165	} while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
2166	}
2167
2168	/*
2169	* No more iovecs remain in this logvec so return the next log vec to
2170	* the caller so it can go back to fast path copying.
2171	*/
2172	*iclogp = iclog;
2173	return `0`;
2174	}
2175
2176	/*
2177	* Write some region out to in-core log
2178	*
2179	* This will be called when writing externally provided regions or when
2180	* writing out a commit record for a given transaction.
2181	*
2182	* General algorithm:
2183	* 1. Find total length of this write. This may include adding to the
2184	* lengths passed in.
2185	* 2. Check whether we violate the tickets reservation.
2186	* 3. While writing to this iclog
2187	* A. Reserve as much space in this iclog as can get
2188	* B. If this is first write, save away start lsn
2189	* C. While writing this region:
2190	* 1. If first write of transaction, write start record
2191	* 2. Write log operation header (header per region)
2192	* 3. Find out if we can fit entire region into this iclog
2193	* 4. Potentially, verify destination memcpy ptr
2194	* 5. Memcpy (partial) region
2195	* 6. If partial copy, release iclog; otherwise, continue
2196	* copying more regions into current iclog
2197	* 4. Mark want sync bit (in simulation mode)
2198	* 5. Release iclog for potential flush to on-disk log.
2199	*
2200	* ERRORS:
2201	* 1. Panic if reservation is overrun. This should never happen since
2202	* reservation amounts are generated internal to the filesystem.
2203	* NOTES:
2204	* 1. Tickets are single threaded data structures.
2205	* 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
2206	* syncing routine. When a single log_write region needs to span
2207	* multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
2208	* on all log operation writes which don't contain the end of the
2209	* region. The XLOG_END_TRANS bit is used for the in-core log
2210	* operation which contains the end of the continued log_write region.
2211	* 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
2212	* we don't really know exactly how much space will be used. As a result,
2213	* we don't update ic_offset until the end when we know exactly how many
2214	* bytes have been written out.
2215	*/
2216	int
2217	xlog_write(
2218	struct xlog *log,
2219	struct xfs_cil_ctx *ctx,
2220	struct list_head *lv_chain,
2221	struct xlog_ticket *ticket,
2222	uint32_t len)
2223
2224	{
2225	struct xlog_in_core *iclog = NULL;
2226	struct xfs_log_vec *lv;
2227	uint32_t record_cnt = `0`;
2228	uint32_t data_cnt = `0`;
2229	int error = `0`;
2230	int log_offset;
2231
2232	if (ticket->t_curr_res < `0`) {
2233	xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
2234	"ctx ticket reservation ran out. Need to up reservation");
2235	xlog_print_tic_res(mp: log->l_mp, ticket);
2236	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
2237	}
2238
2239	error = xlog_state_get_iclog_space(log, len, iclog: &iclog, ticket,
2240	logoffsetp: &log_offset);
2241	if (error)
2242	return error;
2243
2244	ASSERT(log_offset <= iclog->ic_size - `1`);
2245
2246	/*
2247	* If we have a context pointer, pass it the first iclog we are
2248	* writing to so it can record state needed for iclog write
2249	* ordering.
2250	*/
2251	if (ctx)
2252	xlog_cil_set_ctx_write_state(ctx, iclog);
2253
2254	list_for_each_entry(lv, lv_chain, lv_list) {
2255	/*
2256	* If the entire log vec does not fit in the iclog, punt it to
2257	* the partial copy loop which can handle this case.
2258	*/
2259	if (lv->lv_niovecs &&
2260	lv->lv_bytes > iclog->ic_size - log_offset) {
2261	error = xlog_write_partial(lv, ticket, iclogp: &iclog,
2262	log_offset: &log_offset, len: &len, record_cnt: &record_cnt,
2263	data_cnt: &data_cnt);
2264	if (error) {
2265	/*
2266	* We have no iclog to release, so just return
2267	* the error immediately.
2268	*/
2269	return error;
2270	}
2271	} else {
2272	xlog_write_full(lv, ticket, iclog, log_offset: &log_offset,
2273	len: &len, record_cnt: &record_cnt, data_cnt: &data_cnt);
2274	}
2275	}
2276	ASSERT(len == `0`);
2277
2278	/*
2279	* We've already been guaranteed that the last writes will fit inside
2280	* the current iclog, and hence it will already have the space used by
2281	* those writes accounted to it. Hence we do not need to update the
2282	* iclog with the number of bytes written here.
2283	*/
2284	spin_lock(lock: &log->l_icloglock);
2285	xlog_state_finish_copy(log, iclog, record_cnt, copy_bytes: `0`);
2286	error = xlog_state_release_iclog(log, iclog, ticket);
2287	spin_unlock(lock: &log->l_icloglock);
2288
2289	return error;
2290	}
2291
2292	static void
2293	xlog_state_activate_iclog(
2294	struct xlog_in_core *iclog,
2295	int *iclogs_changed)
2296	{
2297	ASSERT(list_empty_careful(&iclog->ic_callbacks));
2298	trace_xlog_iclog_activate(iclog, _RET_IP_);
2299
2300	/*
2301	* If the number of ops in this iclog indicate it just contains the
2302	* dummy transaction, we can change state into IDLE (the second time
2303	* around). Otherwise we should change the state into NEED a dummy.
2304	* We don't need to cover the dummy.
2305	*/
2306	if (*iclogs_changed == `0` &&
2307	iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
2308	*iclogs_changed = `1`;
2309	} else {
2310	/*
2311	* We have two dirty iclogs so start over. This could also be
2312	* num of ops indicating this is not the dummy going out.
2313	*/
2314	*iclogs_changed = `2`;
2315	}
2316
2317	iclog->ic_state = XLOG_STATE_ACTIVE;
2318	iclog->ic_offset = `0`;
2319	iclog->ic_header.h_num_logops = `0`;
2320	memset(iclog->ic_header.h_cycle_data, `0`,
2321	sizeof(iclog->ic_header.h_cycle_data));
2322	iclog->ic_header.h_lsn = `0`;
2323	iclog->ic_header.h_tail_lsn = `0`;
2324	}
2325
2326	/*
2327	* Loop through all iclogs and mark all iclogs currently marked DIRTY as
2328	* ACTIVE after iclog I/O has completed.
2329	*/
2330	static void
2331	xlog_state_activate_iclogs(
2332	struct xlog *log,
2333	int *iclogs_changed)
2334	{
2335	struct xlog_in_core *iclog = log->l_iclog;
2336
2337	do {
2338	if (iclog->ic_state == XLOG_STATE_DIRTY)
2339	xlog_state_activate_iclog(iclog, iclogs_changed);
2340	/*
2341	* The ordering of marking iclogs ACTIVE must be maintained, so
2342	* an iclog doesn't become ACTIVE beyond one that is SYNCING.
2343	*/
2344	else if (iclog->ic_state != XLOG_STATE_ACTIVE)
2345	break;
2346	} while ((iclog = iclog->ic_next) != log->l_iclog);
2347	}
2348
2349	static int
2350	xlog_covered_state(
2351	int prev_state,
2352	int iclogs_changed)
2353	{
2354	/*
2355	* We go to NEED for any non-covering writes. We go to NEED2 if we just
2356	* wrote the first covering record (DONE). We go to IDLE if we just
2357	* wrote the second covering record (DONE2) and remain in IDLE until a
2358	* non-covering write occurs.
2359	*/
2360	switch (prev_state) {
2361	case XLOG_STATE_COVER_IDLE:
2362	if (iclogs_changed == `1`)
2363	return XLOG_STATE_COVER_IDLE;
2364	fallthrough;
2365	case XLOG_STATE_COVER_NEED:
2366	case XLOG_STATE_COVER_NEED2:
2367	break;
2368	case XLOG_STATE_COVER_DONE:
2369	if (iclogs_changed == `1`)
2370	return XLOG_STATE_COVER_NEED2;
2371	break;
2372	case XLOG_STATE_COVER_DONE2:
2373	if (iclogs_changed == `1`)
2374	return XLOG_STATE_COVER_IDLE;
2375	break;
2376	default:
2377	ASSERT(`0`);
2378	}
2379
2380	return XLOG_STATE_COVER_NEED;
2381	}
2382
2383	STATIC void
2384	xlog_state_clean_iclog(
2385	struct xlog *log,
2386	struct xlog_in_core *dirty_iclog)
2387	{
2388	int iclogs_changed = `0`;
2389
2390	trace_xlog_iclog_clean(iclog: dirty_iclog, _RET_IP_);
2391
2392	dirty_iclog->ic_state = XLOG_STATE_DIRTY;
2393
2394	xlog_state_activate_iclogs(log, iclogs_changed: &iclogs_changed);
2395	wake_up_all(&dirty_iclog->ic_force_wait);
2396
2397	if (iclogs_changed) {
2398	log->l_covered_state = xlog_covered_state(prev_state: log->l_covered_state,
2399	iclogs_changed);
2400	}
2401	}
2402
2403	STATIC xfs_lsn_t
2404	xlog_get_lowest_lsn(
2405	struct xlog *log)
2406	{
2407	struct xlog_in_core *iclog = log->l_iclog;
2408	xfs_lsn_t lowest_lsn = `0`, lsn;
2409
2410	do {
2411	if (iclog->ic_state == XLOG_STATE_ACTIVE \|\|
2412	iclog->ic_state == XLOG_STATE_DIRTY)
2413	continue;
2414
2415	lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2416	if ((lsn && !lowest_lsn) \|\| XFS_LSN_CMP(lsn, lowest_lsn) < `0`)
2417	lowest_lsn = lsn;
2418	} while ((iclog = iclog->ic_next) != log->l_iclog);
2419
2420	return lowest_lsn;
2421	}
2422
2423	/*
2424	* Return true if we need to stop processing, false to continue to the next
2425	* iclog. The caller will need to run callbacks if the iclog is returned in the
2426	* XLOG_STATE_CALLBACK state.
2427	*/
2428	static bool
2429	xlog_state_iodone_process_iclog(
2430	struct xlog *log,
2431	struct xlog_in_core *iclog)
2432	{
2433	xfs_lsn_t lowest_lsn;
2434	xfs_lsn_t header_lsn;
2435
2436	switch (iclog->ic_state) {
2437	case XLOG_STATE_ACTIVE:
2438	case XLOG_STATE_DIRTY:
2439	/*
2440	* Skip all iclogs in the ACTIVE & DIRTY states:
2441	*/
2442	return false;
2443	case XLOG_STATE_DONE_SYNC:
2444	/*
2445	* Now that we have an iclog that is in the DONE_SYNC state, do
2446	* one more check here to see if we have chased our tail around.
2447	* If this is not the lowest lsn iclog, then we will leave it
2448	* for another completion to process.
2449	*/
2450	header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2451	lowest_lsn = xlog_get_lowest_lsn(log);
2452	if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < `0`)
2453	return false;
2454	/*
2455	* If there are no callbacks on this iclog, we can mark it clean
2456	* immediately and return. Otherwise we need to run the
2457	* callbacks.
2458	*/
2459	if (list_empty(head: &iclog->ic_callbacks)) {
2460	xlog_state_clean_iclog(log, dirty_iclog: iclog);
2461	return false;
2462	}
2463	trace_xlog_iclog_callback(iclog, _RET_IP_);
2464	iclog->ic_state = XLOG_STATE_CALLBACK;
2465	return false;
2466	default:
2467	/*
2468	* Can only perform callbacks in order. Since this iclog is not
2469	* in the DONE_SYNC state, we skip the rest and just try to
2470	* clean up.
2471	*/
2472	return true;
2473	}
2474	}
2475
2476	/*
2477	* Loop over all the iclogs, running attached callbacks on them. Return true if
2478	* we ran any callbacks, indicating that we dropped the icloglock. We don't need
2479	* to handle transient shutdown state here at all because
2480	* xlog_state_shutdown_callbacks() will be run to do the necessary shutdown
2481	* cleanup of the callbacks.
2482	*/
2483	static bool
2484	xlog_state_do_iclog_callbacks(
2485	struct xlog *log)
2486	__releases(&log->l_icloglock)
2487	__acquires(&log->l_icloglock)
2488	{
2489	struct xlog_in_core *first_iclog = log->l_iclog;
2490	struct xlog_in_core *iclog = first_iclog;
2491	bool ran_callback = false;
2492
2493	do {
2494	LIST_HEAD(cb_list);
2495
2496	if (xlog_state_iodone_process_iclog(log, iclog))
2497	break;
2498	if (iclog->ic_state != XLOG_STATE_CALLBACK) {
2499	iclog = iclog->ic_next;
2500	continue;
2501	}
2502	list_splice_init(list: &iclog->ic_callbacks, head: &cb_list);
2503	spin_unlock(lock: &log->l_icloglock);
2504
2505	trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
2506	xlog_cil_process_committed(list: &cb_list);
2507	trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
2508	ran_callback = true;
2509
2510	spin_lock(lock: &log->l_icloglock);
2511	xlog_state_clean_iclog(log, dirty_iclog: iclog);
2512	iclog = iclog->ic_next;
2513	} while (iclog != first_iclog);
2514
2515	return ran_callback;
2516	}
2517
2518
2519	/*
2520	* Loop running iclog completion callbacks until there are no more iclogs in a
2521	* state that can run callbacks.
2522	*/
2523	STATIC void
2524	xlog_state_do_callback(
2525	struct xlog *log)
2526	{
2527	int flushcnt = `0`;
2528	int repeats = `0`;
2529
2530	spin_lock(lock: &log->l_icloglock);
2531	while (xlog_state_do_iclog_callbacks(log)) {
2532	if (xlog_is_shutdown(log))
2533	break;
2534
2535	if (++repeats > `5000`) {
2536	flushcnt += repeats;
2537	repeats = `0`;
2538	xfs_warn(log->l_mp,
2539	"%s: possible infinite loop (%d iterations)",
2540	__func__, flushcnt);
2541	}
2542	}
2543
2544	if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE)
2545	wake_up_all(&log->l_flush_wait);
2546
2547	spin_unlock(lock: &log->l_icloglock);
2548	}
2549
2550
2551	/*
2552	* Finish transitioning this iclog to the dirty state.
2553	*
2554	* Callbacks could take time, so they are done outside the scope of the
2555	* global state machine log lock.
2556	*/
2557	STATIC void
2558	xlog_state_done_syncing(
2559	struct xlog_in_core *iclog)
2560	{
2561	struct xlog *log = iclog->ic_log;
2562
2563	spin_lock(lock: &log->l_icloglock);
2564	ASSERT(atomic_read(&iclog->ic_refcnt) == `0`);
2565	trace_xlog_iclog_sync_done(iclog, _RET_IP_);
2566
2567	/*
2568	* If we got an error, either on the first buffer, or in the case of
2569	* split log writes, on the second, we shut down the file system and
2570	* no iclogs should ever be attempted to be written to disk again.
2571	*/
2572	if (!xlog_is_shutdown(log)) {
2573	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
2574	iclog->ic_state = XLOG_STATE_DONE_SYNC;
2575	}
2576
2577	/*
2578	* Someone could be sleeping prior to writing out the next
2579	* iclog buffer, we wake them all, one will get to do the
2580	* I/O, the others get to wait for the result.
2581	*/
2582	wake_up_all(&iclog->ic_write_wait);
2583	spin_unlock(lock: &log->l_icloglock);
2584	xlog_state_do_callback(log);
2585	}
2586
2587	/*
2588	* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2589	* sleep. We wait on the flush queue on the head iclog as that should be
2590	* the first iclog to complete flushing. Hence if all iclogs are syncing,
2591	* we will wait here and all new writes will sleep until a sync completes.
2592	*
2593	* The in-core logs are used in a circular fashion. They are not used
2594	* out-of-order even when an iclog past the head is free.
2595	*
2596	* return:
2597	* * log_offset where xlog_write() can start writing into the in-core
2598	* log's data space.
2599	* * in-core log pointer to which xlog_write() should write.
2600	* * boolean indicating this is a continued write to an in-core log.
2601	* If this is the last write, then the in-core log's offset field
2602	* needs to be incremented, depending on the amount of data which
2603	* is copied.
2604	*/
2605	STATIC int
2606	xlog_state_get_iclog_space(
2607	struct xlog *log,
2608	int len,
2609	struct xlog_in_core **iclogp,
2610	struct xlog_ticket *ticket,
2611	int *logoffsetp)
2612	{
2613	int log_offset;
2614	xlog_rec_header_t *head;
2615	xlog_in_core_t *iclog;
2616
2617	restart:
2618	spin_lock(lock: &log->l_icloglock);
2619	if (xlog_is_shutdown(log)) {
2620	spin_unlock(lock: &log->l_icloglock);
2621	return -EIO;
2622	}
2623
2624	iclog = log->l_iclog;
2625	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
2626	XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
2627
2628	/ Wait for log writes to have flushed /
2629	xlog_wait(wq: &log->l_flush_wait, lock: &log->l_icloglock);
2630	goto restart;
2631	}
2632
2633	head = &iclog->ic_header;
2634
2635	atomic_inc(v: &iclog->ic_refcnt); / prevents sync /
2636	log_offset = iclog->ic_offset;
2637
2638	trace_xlog_iclog_get_space(iclog, _RET_IP_);
2639
2640	/ On the 1st write to an iclog, figure out lsn. This works*
2641	* if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
2642	* committing to. If the offset is set, that's how many blocks
2643	* must be written.
2644	*/
2645	if (log_offset == `0`) {
2646	ticket->t_curr_res -= log->l_iclog_hsize;
2647	head->h_cycle = cpu_to_be32(log->l_curr_cycle);
2648	head->h_lsn = cpu_to_be64(
2649	xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
2650	ASSERT(log->l_curr_block >= `0`);
2651	}
2652
2653	/ If there is enough room to write everything, then do it. Otherwise,*
2654	* claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
2655	* bit is on, so this will get flushed out. Don't update ic_offset
2656	* until you know exactly how many bytes get copied. Therefore, wait
2657	* until later to update ic_offset.
2658	*
2659	* xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
2660	* can fit into remaining data section.
2661	*/
2662	if (iclog->ic_size - iclog->ic_offset < `2`*sizeof(xlog_op_header_t)) {
2663	int error = `0`;
2664
2665	xlog_state_switch_iclogs(log, iclog, eventual_size: iclog->ic_size);
2666
2667	/*
2668	* If we are the only one writing to this iclog, sync it to
2669	* disk. We need to do an atomic compare and decrement here to
2670	* avoid racing with concurrent atomic_dec_and_lock() calls in
2671	* xlog_state_release_iclog() when there is more than one
2672	* reference to the iclog.
2673	*/
2674	if (!atomic_add_unless(v: &iclog->ic_refcnt, a: -`1`, u: `1`))
2675	error = xlog_state_release_iclog(log, iclog, ticket);
2676	spin_unlock(lock: &log->l_icloglock);
2677	if (error)
2678	return error;
2679	goto restart;
2680	}
2681
2682	/ Do we have enough room to write the full amount in the remainder*
2683	* of this iclog? Or must we continue a write on the next iclog and
2684	* mark this iclog as completely taken? In the case where we switch
2685	* iclogs (to mark it taken), this particular iclog will release/sync
2686	* to disk in xlog_write().
2687	*/
2688	if (len <= iclog->ic_size - iclog->ic_offset)
2689	iclog->ic_offset += len;
2690	else
2691	xlog_state_switch_iclogs(log, iclog, eventual_size: iclog->ic_size);
2692	*iclogp = iclog;
2693
2694	ASSERT(iclog->ic_offset <= iclog->ic_size);
2695	spin_unlock(lock: &log->l_icloglock);
2696
2697	*logoffsetp = log_offset;
2698	return `0`;
2699	}
2700
2701	/*
2702	* The first cnt-1 times a ticket goes through here we don't need to move the
2703	* grant write head because the permanent reservation has reserved cnt times the
2704	* unit amount. Release part of current permanent unit reservation and reset
2705	* current reservation to be one units worth. Also move grant reservation head
2706	* forward.
2707	*/
2708	void
2709	xfs_log_ticket_regrant(
2710	struct xlog *log,
2711	struct xlog_ticket *ticket)
2712	{
2713	trace_xfs_log_ticket_regrant(log, tic: ticket);
2714
2715	if (ticket->t_cnt > `0`)
2716	ticket->t_cnt--;
2717
2718	xlog_grant_sub_space(head: &log->l_reserve_head, bytes: ticket->t_curr_res);
2719	xlog_grant_sub_space(head: &log->l_write_head, bytes: ticket->t_curr_res);
2720	ticket->t_curr_res = ticket->t_unit_res;
2721
2722	trace_xfs_log_ticket_regrant_sub(log, tic: ticket);
2723
2724	/ just return if we still have some of the pre-reserved space /
2725	if (!ticket->t_cnt) {
2726	xlog_grant_add_space(head: &log->l_reserve_head, bytes: ticket->t_unit_res);
2727	trace_xfs_log_ticket_regrant_exit(log, tic: ticket);
2728	}
2729
2730	xfs_log_ticket_put(ticket);
2731	}
2732
2733	/*
2734	* Give back the space left from a reservation.
2735	*
2736	* All the information we need to make a correct determination of space left
2737	* is present. For non-permanent reservations, things are quite easy. The
2738	* count should have been decremented to zero. We only need to deal with the
2739	* space remaining in the current reservation part of the ticket. If the
2740	* ticket contains a permanent reservation, there may be left over space which
2741	* needs to be released. A count of N means that N-1 refills of the current
2742	* reservation can be done before we need to ask for more space. The first
2743	* one goes to fill up the first current reservation. Once we run out of
2744	* space, the count will stay at zero and the only space remaining will be
2745	* in the current reservation field.
2746	*/
2747	void
2748	xfs_log_ticket_ungrant(
2749	struct xlog *log,
2750	struct xlog_ticket *ticket)
2751	{
2752	int bytes;
2753
2754	trace_xfs_log_ticket_ungrant(log, tic: ticket);
2755
2756	if (ticket->t_cnt > `0`)
2757	ticket->t_cnt--;
2758
2759	trace_xfs_log_ticket_ungrant_sub(log, tic: ticket);
2760
2761	/*
2762	* If this is a permanent reservation ticket, we may be able to free
2763	* up more space based on the remaining count.
2764	*/
2765	bytes = ticket->t_curr_res;
2766	if (ticket->t_cnt > `0`) {
2767	ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2768	bytes += ticket->t_unit_res*ticket->t_cnt;
2769	}
2770
2771	xlog_grant_sub_space(head: &log->l_reserve_head, bytes);
2772	xlog_grant_sub_space(head: &log->l_write_head, bytes);
2773
2774	trace_xfs_log_ticket_ungrant_exit(log, tic: ticket);
2775
2776	xfs_log_space_wake(mp: log->l_mp);
2777	xfs_log_ticket_put(ticket);
2778	}
2779
2780	/*
2781	* This routine will mark the current iclog in the ring as WANT_SYNC and move
2782	* the current iclog pointer to the next iclog in the ring.
2783	*/
2784	void
2785	xlog_state_switch_iclogs(
2786	struct xlog *log,
2787	struct xlog_in_core *iclog,
2788	int eventual_size)
2789	{
2790	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2791	assert_spin_locked(&log->l_icloglock);
2792	trace_xlog_iclog_switch(iclog, _RET_IP_);
2793
2794	if (!eventual_size)
2795	eventual_size = iclog->ic_offset;
2796	iclog->ic_state = XLOG_STATE_WANT_SYNC;
2797	iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
2798	log->l_prev_block = log->l_curr_block;
2799	log->l_prev_cycle = log->l_curr_cycle;
2800
2801	/ roll log?: ic_offset changed later /
2802	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
2803
2804	/ Round up to next log-sunit /
2805	if (log->l_iclog_roundoff > BBSIZE) {
2806	uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff);
2807	log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
2808	}
2809
2810	if (log->l_curr_block >= log->l_logBBsize) {
2811	/*
2812	* Rewind the current block before the cycle is bumped to make
2813	* sure that the combined LSN never transiently moves forward
2814	* when the log wraps to the next cycle. This is to support the
2815	* unlocked sample of these fields from xlog_valid_lsn(). Most
2816	* other cases should acquire l_icloglock.
2817	*/
2818	log->l_curr_block -= log->l_logBBsize;
2819	ASSERT(log->l_curr_block >= `0`);
2820	smp_wmb();
2821	log->l_curr_cycle++;
2822	if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
2823	log->l_curr_cycle++;
2824	}
2825	ASSERT(iclog == log->l_iclog);
2826	log->l_iclog = iclog->ic_next;
2827	}
2828
2829	/*
2830	* Force the iclog to disk and check if the iclog has been completed before
2831	* xlog_force_iclog() returns. This can happen on synchronous (e.g.
2832	* pmem) or fast async storage because we drop the icloglock to issue the IO.
2833	* If completion has already occurred, tell the caller so that it can avoid an
2834	* unnecessary wait on the iclog.
2835	*/
2836	static int
2837	xlog_force_and_check_iclog(
2838	struct xlog_in_core *iclog,
2839	bool *completed)
2840	{
2841	xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2842	int error;
2843
2844	*completed = false;
2845	error = xlog_force_iclog(iclog);
2846	if (error)
2847	return error;
2848
2849	/*
2850	* If the iclog has already been completed and reused the header LSN
2851	* will have been rewritten by completion
2852	*/
2853	if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
2854	*completed = true;
2855	return `0`;
2856	}
2857
2858	/*
2859	* Write out all data in the in-core log as of this exact moment in time.
2860	*
2861	* Data may be written to the in-core log during this call. However,
2862	* we don't guarantee this data will be written out. A change from past
2863	* implementation means this routine will not write out zero length LRs.
2864	*
2865	* Basically, we try and perform an intelligent scan of the in-core logs.
2866	* If we determine there is no flushable data, we just return. There is no
2867	* flushable data if:
2868	*
2869	* 1. the current iclog is active and has no data; the previous iclog
2870	* is in the active or dirty state.
2871	* 2. the current iclog is dirty, and the previous iclog is in the
2872	* active or dirty state.
2873	*
2874	* We may sleep if:
2875	*
2876	* 1. the current iclog is not in the active nor dirty state.
2877	* 2. the current iclog dirty, and the previous iclog is not in the
2878	* active nor dirty state.
2879	* 3. the current iclog is active, and there is another thread writing
2880	* to this particular iclog.
2881	* 4. a) the current iclog is active and has no other writers
2882	* b) when we return from flushing out this iclog, it is still
2883	* not in the active nor dirty state.
2884	*/
2885	int
2886	xfs_log_force(
2887	struct xfs_mount *mp,
2888	uint flags)
2889	{
2890	struct xlog *log = mp->m_log;
2891	struct xlog_in_core *iclog;
2892
2893	XFS_STATS_INC(mp, xs_log_force);
2894	trace_xfs_log_force(mp, `0`, _RET_IP_);
2895
2896	xlog_cil_force(log);
2897
2898	spin_lock(lock: &log->l_icloglock);
2899	if (xlog_is_shutdown(log))
2900	goto out_error;
2901
2902	iclog = log->l_iclog;
2903	trace_xlog_iclog_force(iclog, _RET_IP_);
2904
2905	if (iclog->ic_state == XLOG_STATE_DIRTY \|\|
2906	(iclog->ic_state == XLOG_STATE_ACTIVE &&
2907	atomic_read(v: &iclog->ic_refcnt) == `0` && iclog->ic_offset == `0`)) {
2908	/*
2909	* If the head is dirty or (active and empty), then we need to
2910	* look at the previous iclog.
2911	*
2912	* If the previous iclog is active or dirty we are done. There
2913	* is nothing to sync out. Otherwise, we attach ourselves to the
2914	* previous iclog and go to sleep.
2915	*/
2916	iclog = iclog->ic_prev;
2917	} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
2918	if (atomic_read(v: &iclog->ic_refcnt) == `0`) {
2919	/ We have exclusive access to this iclog. /
2920	bool completed;
2921
2922	if (xlog_force_and_check_iclog(iclog, completed: &completed))
2923	goto out_error;
2924
2925	if (completed)
2926	goto out_unlock;
2927	} else {
2928	/*
2929	* Someone else is still writing to this iclog, so we
2930	* need to ensure that when they release the iclog it
2931	* gets synced immediately as we may be waiting on it.
2932	*/
2933	xlog_state_switch_iclogs(log, iclog, eventual_size: `0`);
2934	}
2935	}
2936
2937	/*
2938	* The iclog we are about to wait on may contain the checkpoint pushed
2939	* by the above xlog_cil_force() call, but it may not have been pushed
2940	* to disk yet. Like the ACTIVE case above, we need to make sure caches
2941	* are flushed when this iclog is written.
2942	*/
2943	if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
2944	iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
2945
2946	if (flags & XFS_LOG_SYNC)
2947	return xlog_wait_on_iclog(iclog);
2948	out_unlock:
2949	spin_unlock(lock: &log->l_icloglock);
2950	return `0`;
2951	out_error:
2952	spin_unlock(lock: &log->l_icloglock);
2953	return -EIO;
2954	}
2955
2956	/*
2957	* Force the log to a specific LSN.
2958	*
2959	* If an iclog with that lsn can be found:
2960	* If it is in the DIRTY state, just return.
2961	* If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
2962	* state and go to sleep or return.
2963	* If it is in any other state, go to sleep or return.
2964	*
2965	* Synchronous forces are implemented with a wait queue. All callers trying
2966	* to force a given lsn to disk must wait on the queue attached to the
2967	* specific in-core log. When given in-core log finally completes its write
2968	* to disk, that thread will wake up all threads waiting on the queue.
2969	*/
2970	static int
2971	xlog_force_lsn(
2972	struct xlog *log,
2973	xfs_lsn_t lsn,
2974	uint flags,
2975	int *log_flushed,
2976	bool already_slept)
2977	{
2978	struct xlog_in_core *iclog;
2979	bool completed;
2980
2981	spin_lock(lock: &log->l_icloglock);
2982	if (xlog_is_shutdown(log))
2983	goto out_error;
2984
2985	iclog = log->l_iclog;
2986	while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
2987	trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
2988	iclog = iclog->ic_next;
2989	if (iclog == log->l_iclog)
2990	goto out_unlock;
2991	}
2992
2993	switch (iclog->ic_state) {
2994	case XLOG_STATE_ACTIVE:
2995	/*
2996	* We sleep here if we haven't already slept (e.g. this is the
2997	* first time we've looked at the correct iclog buf) and the
2998	* buffer before us is going to be sync'ed. The reason for this
2999	* is that if we are doing sync transactions here, by waiting
3000	* for the previous I/O to complete, we can allow a few more
3001	* transactions into this iclog before we close it down.
3002	*
3003	* Otherwise, we mark the buffer WANT_SYNC, and bump up the
3004	* refcnt so we can release the log (which drops the ref count).
3005	* The state switch keeps new transaction commits from using
3006	* this buffer. When the current commits finish writing into
3007	* the buffer, the refcount will drop to zero and the buffer
3008	* will go out then.
3009	*/
3010	if (!already_slept &&
3011	(iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC \|\|
3012	iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
3013	xlog_wait(wq: &iclog->ic_prev->ic_write_wait,
3014	lock: &log->l_icloglock);
3015	return -EAGAIN;
3016	}
3017	if (xlog_force_and_check_iclog(iclog, completed: &completed))
3018	goto out_error;
3019	if (log_flushed)
3020	*log_flushed = `1`;
3021	if (completed)
3022	goto out_unlock;
3023	break;
3024	case XLOG_STATE_WANT_SYNC:
3025	/*
3026	* This iclog may contain the checkpoint pushed by the
3027	* xlog_cil_force_seq() call, but there are other writers still
3028	* accessing it so it hasn't been pushed to disk yet. Like the
3029	* ACTIVE case above, we need to make sure caches are flushed
3030	* when this iclog is written.
3031	*/
3032	iclog->ic_flags \|= XLOG_ICL_NEED_FLUSH \| XLOG_ICL_NEED_FUA;
3033	break;
3034	default:
3035	/*
3036	* The entire checkpoint was written by the CIL force and is on
3037	* its way to disk already. It will be stable when it
3038	* completes, so we don't need to manipulate caches here at all.
3039	* We just need to wait for completion if necessary.
3040	*/
3041	break;
3042	}
3043
3044	if (flags & XFS_LOG_SYNC)
3045	return xlog_wait_on_iclog(iclog);
3046	out_unlock:
3047	spin_unlock(lock: &log->l_icloglock);
3048	return `0`;
3049	out_error:
3050	spin_unlock(lock: &log->l_icloglock);
3051	return -EIO;
3052	}
3053
3054	/*
3055	* Force the log to a specific checkpoint sequence.
3056	*
3057	* First force the CIL so that all the required changes have been flushed to the
3058	* iclogs. If the CIL force completed it will return a commit LSN that indicates
3059	* the iclog that needs to be flushed to stable storage. If the caller needs
3060	* a synchronous log force, we will wait on the iclog with the LSN returned by
3061	* xlog_cil_force_seq() to be completed.
3062	*/
3063	int
3064	xfs_log_force_seq(
3065	struct xfs_mount *mp,
3066	xfs_csn_t seq,
3067	uint flags,
3068	int *log_flushed)
3069	{
3070	struct xlog *log = mp->m_log;
3071	xfs_lsn_t lsn;
3072	int ret;
3073	ASSERT(seq != `0`);
3074
3075	XFS_STATS_INC(mp, xs_log_force);
3076	trace_xfs_log_force(mp, seq, _RET_IP_);
3077
3078	lsn = xlog_cil_force_seq(log, seq);
3079	if (lsn == NULLCOMMITLSN)
3080	return `0`;
3081
3082	ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
3083	if (ret == -EAGAIN) {
3084	XFS_STATS_INC(mp, xs_log_force_sleep);
3085	ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
3086	}
3087	return ret;
3088	}
3089
3090	/*
3091	* Free a used ticket when its refcount falls to zero.
3092	*/
3093	void
3094	xfs_log_ticket_put(
3095	xlog_ticket_t *ticket)
3096	{
3097	ASSERT(atomic_read(&ticket->t_ref) > `0`);
3098	if (atomic_dec_and_test(v: &ticket->t_ref))
3099	kmem_cache_free(s: xfs_log_ticket_cache, objp: ticket);
3100	}
3101
3102	xlog_ticket_t *
3103	xfs_log_ticket_get(
3104	xlog_ticket_t *ticket)
3105	{
3106	ASSERT(atomic_read(&ticket->t_ref) > `0`);
3107	atomic_inc(v: &ticket->t_ref);
3108	return ticket;
3109	}
3110
3111	/*
3112	* Figure out the total log space unit (in bytes) that would be
3113	* required for a log ticket.
3114	*/
3115	static int
3116	xlog_calc_unit_res(
3117	struct xlog *log,
3118	int unit_bytes,
3119	int *niclogs)
3120	{
3121	int iclog_space;
3122	uint num_headers;
3123
3124	/*
3125	* Permanent reservations have up to 'cnt'-1 active log operations
3126	* in the log. A unit in this case is the amount of space for one
3127	* of these log operations. Normal reservations have a cnt of 1
3128	* and their unit amount is the total amount of space required.
3129	*
3130	* The following lines of code account for non-transaction data
3131	* which occupy space in the on-disk log.
3132	*
3133	* Normal form of a transaction is:
3134	* <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
3135	* and then there are LR hdrs, split-recs and roundoff at end of syncs.
3136	*
3137	* We need to account for all the leadup data and trailer data
3138	* around the transaction data.
3139	* And then we need to account for the worst case in terms of using
3140	* more space.
3141	* The worst case will happen if:
3142	* - the placement of the transaction happens to be such that the
3143	* roundoff is at its maximum
3144	* - the transaction data is synced before the commit record is synced
3145	* i.e. <transaction-data><roundoff> \| <commit-rec><roundoff>
3146	* Therefore the commit record is in its own Log Record.
3147	* This can happen as the commit record is called with its
3148	* own region to xlog_write().
3149	* This then means that in the worst case, roundoff can happen for
3150	* the commit-rec as well.
3151	* The commit-rec is smaller than padding in this scenario and so it is
3152	* not added separately.
3153	*/
3154
3155	/ for trans header /
3156	unit_bytes += sizeof(xlog_op_header_t);
3157	unit_bytes += sizeof(xfs_trans_header_t);
3158
3159	/ for start-rec /
3160	unit_bytes += sizeof(xlog_op_header_t);
3161
3162	/*
3163	* for LR headers - the space for data in an iclog is the size minus
3164	* the space used for the headers. If we use the iclog size, then we
3165	* undercalculate the number of headers required.
3166	*
3167	* Furthermore - the addition of op headers for split-recs might
3168	* increase the space required enough to require more log and op
3169	* headers, so take that into account too.
3170	*
3171	* IMPORTANT: This reservation makes the assumption that if this
3172	* transaction is the first in an iclog and hence has the LR headers
3173	* accounted to it, then the remaining space in the iclog is
3174	* exclusively for this transaction. i.e. if the transaction is larger
3175	* than the iclog, it will be the only thing in that iclog.
3176	* Fundamentally, this means we must pass the entire log vector to
3177	* xlog_write to guarantee this.
3178	*/
3179	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3180	num_headers = howmany(unit_bytes, iclog_space);
3181
3182	/ for split-recs - ophdrs added when data split over LRs /
3183	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3184
3185	/ add extra header reservations if we overrun /
3186	while (!num_headers \|\|
3187	howmany(unit_bytes, iclog_space) > num_headers) {
3188	unit_bytes += sizeof(xlog_op_header_t);
3189	num_headers++;
3190	}
3191	unit_bytes += log->l_iclog_hsize * num_headers;
3192
3193	/ for commit-rec LR header - note: padding will subsume the ophdr /
3194	unit_bytes += log->l_iclog_hsize;
3195
3196	/ roundoff padding for transaction data and one for commit record /
3197	unit_bytes += `2` * log->l_iclog_roundoff;
3198
3199	if (niclogs)
3200	*niclogs = num_headers;
3201	return unit_bytes;
3202	}
3203
3204	int
3205	xfs_log_calc_unit_res(
3206	struct xfs_mount *mp,
3207	int unit_bytes)
3208	{
3209	return xlog_calc_unit_res(log: mp->m_log, unit_bytes, NULL);
3210	}
3211
3212	/*
3213	* Allocate and initialise a new log ticket.
3214	*/
3215	struct xlog_ticket *
3216	xlog_ticket_alloc(
3217	struct xlog *log,
3218	int unit_bytes,
3219	int cnt,
3220	bool permanent)
3221	{
3222	struct xlog_ticket *tic;
3223	int unit_res;
3224
3225	tic = kmem_cache_zalloc(xfs_log_ticket_cache,
3226	GFP_KERNEL \| __GFP_NOFAIL);
3227
3228	unit_res = xlog_calc_unit_res(log, unit_bytes, niclogs: &tic->t_iclog_hdrs);
3229
3230	atomic_set(v: &tic->t_ref, i: `1`);
3231	tic->t_task = current;
3232	INIT_LIST_HEAD(list: &tic->t_queue);
3233	tic->t_unit_res = unit_res;
3234	tic->t_curr_res = unit_res;
3235	tic->t_cnt = cnt;
3236	tic->t_ocnt = cnt;
3237	tic->t_tid = get_random_u32();
3238	if (permanent)
3239	tic->t_flags \|= XLOG_TIC_PERM_RESERV;
3240
3241	return tic;
3242	}
3243
3244	#if defined(DEBUG)
3245	static void
3246	xlog_verify_dump_tail(
3247	struct xlog *log,
3248	struct xlog_in_core *iclog)
3249	{
3250	xfs_alert(log->l_mp,
3251	"ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x",
3252	iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -`1`,
3253	atomic64_read(&log->l_tail_lsn),
3254	log->l_ailp->ail_head_lsn,
3255	log->l_curr_cycle, log->l_curr_block,
3256	log->l_prev_cycle, log->l_prev_block);
3257	xfs_alert(log->l_mp,
3258	"write grant 0x%llx, reserve grant 0x%llx, tail_space 0x%llx, size 0x%x, iclog flags 0x%x",
3259	atomic64_read(&log->l_write_head.grant),
3260	atomic64_read(&log->l_reserve_head.grant),
3261	log->l_tail_space, log->l_logsize,
3262	iclog ? iclog->ic_flags : -`1`);
3263	}
3264
3265	/ Check if the new iclog will fit in the log. /
3266	STATIC void
3267	xlog_verify_tail_lsn(
3268	struct xlog *log,
3269	struct xlog_in_core *iclog)
3270	{
3271	xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
3272	int blocks;
3273
3274	if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
3275	blocks = log->l_logBBsize -
3276	(log->l_prev_block - BLOCK_LSN(tail_lsn));
3277	if (blocks < BTOBB(iclog->ic_offset) +
3278	BTOBB(log->l_iclog_hsize)) {
3279	xfs_emerg(log->l_mp,
3280	"%s: ran out of log space", __func__);
3281	xlog_verify_dump_tail(log, iclog);
3282	}
3283	return;
3284	}
3285
3286	if (CYCLE_LSN(tail_lsn) + `1` != log->l_prev_cycle) {
3287	xfs_emerg(log->l_mp, "%s: head has wrapped tail.", __func__);
3288	xlog_verify_dump_tail(log, iclog);
3289	return;
3290	}
3291	if (BLOCK_LSN(tail_lsn) == log->l_prev_block) {
3292	xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3293	xlog_verify_dump_tail(log, iclog);
3294	return;
3295	}
3296
3297	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3298	if (blocks < BTOBB(iclog->ic_offset) + `1`) {
3299	xfs_emerg(log->l_mp, "%s: ran out of iclog space", __func__);
3300	xlog_verify_dump_tail(log, iclog);
3301	}
3302	}
3303
3304	/*
3305	* Perform a number of checks on the iclog before writing to disk.
3306	*
3307	* 1. Make sure the iclogs are still circular
3308	* 2. Make sure we have a good magic number
3309	* 3. Make sure we don't have magic numbers in the data
3310	* 4. Check fields of each log operation header for:
3311	* A. Valid client identifier
3312	* B. tid ptr value falls in valid ptr space (user space code)
3313	* C. Length in log record header is correct according to the
3314	* individual operation headers within record.
3315	* 5. When a bwrite will occur within 5 blocks of the front of the physical
3316	* log, check the preceding blocks of the physical log to make sure all
3317	* the cycle numbers agree with the current cycle number.
3318	*/
3319	STATIC void
3320	xlog_verify_iclog(
3321	struct xlog *log,
3322	struct xlog_in_core *iclog,
3323	int count)
3324	{
3325	xlog_op_header_t *ophead;
3326	xlog_in_core_t *icptr;
3327	xlog_in_core_2_t *xhdr;
3328	void base_ptr, ptr, *p;
3329	ptrdiff_t field_offset;
3330	uint8_t clientid;
3331	int len, i, j, k, op_len;
3332	int idx;
3333
3334	/ check validity of iclog pointers /
3335	spin_lock(lock: &log->l_icloglock);
3336	icptr = log->l_iclog;
3337	for (i = `0`; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
3338	ASSERT(icptr);
3339
3340	if (icptr != log->l_iclog)
3341	xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3342	spin_unlock(lock: &log->l_icloglock);
3343
3344	/ check log magic numbers /
3345	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3346	xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3347
3348	base_ptr = ptr = &iclog->ic_header;
3349	p = &iclog->ic_header;
3350	for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
3351	if ((__be32 )ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3352	xfs_emerg(log->l_mp, "%s: unexpected magic num",
3353	__func__);
3354	}
3355
3356	/ check fields /
3357	len = be32_to_cpu(iclog->ic_header.h_num_logops);
3358	base_ptr = ptr = iclog->ic_datap;
3359	ophead = ptr;
3360	xhdr = iclog->ic_data;
3361	for (i = `0`; i < len; i++) {
3362	ophead = ptr;
3363
3364	/ clientid is only 1 byte /
3365	p = &ophead->oh_clientid;
3366	field_offset = p - base_ptr;
3367	if (field_offset & `0x1ff`) {
3368	clientid = ophead->oh_clientid;
3369	} else {
3370	idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
3371	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3372	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3373	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3374	clientid = xlog_get_client_id(
3375	xhdr[j].hic_xheader.xh_cycle_data[k]);
3376	} else {
3377	clientid = xlog_get_client_id(
3378	i: iclog->ic_header.h_cycle_data[idx]);
3379	}
3380	}
3381	if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
3382	xfs_warn(log->l_mp,
3383	"%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
3384	__func__, i, clientid, ophead,
3385	(unsigned long)field_offset);
3386	}
3387
3388	/ check length /
3389	p = &ophead->oh_len;
3390	field_offset = p - base_ptr;
3391	if (field_offset & `0x1ff`) {
3392	op_len = be32_to_cpu(ophead->oh_len);
3393	} else {
3394	idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
3395	if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3396	j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3397	k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3398	op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
3399	} else {
3400	op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
3401	}
3402	}
3403	ptr += sizeof(xlog_op_header_t) + op_len;
3404	}
3405	}
3406	#endif
3407
3408	/*
3409	* Perform a forced shutdown on the log.
3410	*
3411	* This can be called from low level log code to trigger a shutdown, or from the
3412	* high level mount shutdown code when the mount shuts down.
3413	*
3414	* Our main objectives here are to make sure that:
3415	* a. if the shutdown was not due to a log IO error, flush the logs to
3416	* disk. Anything modified after this is ignored.
3417	* b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested
3418	* parties to find out. Nothing new gets queued after this is done.
3419	* c. Tasks sleeping on log reservations, pinned objects and
3420	* other resources get woken up.
3421	* d. The mount is also marked as shut down so that log triggered shutdowns
3422	* still behave the same as if they called xfs_forced_shutdown().
3423	*
3424	* Return true if the shutdown cause was a log IO error and we actually shut the
3425	* log down.
3426	*/
3427	bool
3428	xlog_force_shutdown(
3429	struct xlog *log,
3430	uint32_t shutdown_flags)
3431	{
3432	bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
3433
3434	if (!log)
3435	return false;
3436
3437	/*
3438	* Ensure that there is only ever one log shutdown being processed.
3439	* If we allow the log force below on a second pass after shutting
3440	* down the log, we risk deadlocking the CIL push as it may require
3441	* locks on objects the current shutdown context holds (e.g. taking
3442	* buffer locks to abort buffers on last unpin of buf log items).
3443	*/
3444	if (test_and_set_bit(XLOG_SHUTDOWN_STARTED, addr: &log->l_opstate))
3445	return false;
3446
3447	/*
3448	* Flush all the completed transactions to disk before marking the log
3449	* being shut down. We need to do this first as shutting down the log
3450	* before the force will prevent the log force from flushing the iclogs
3451	* to disk.
3452	*
3453	* When we are in recovery, there are no transactions to flush, and
3454	* we don't want to touch the log because we don't want to perturb the
3455	* current head/tail for future recovery attempts. Hence we need to
3456	* avoid a log force in this case.
3457	*
3458	* If we are shutting down due to a log IO error, then we must avoid
3459	* trying to write the log as that may just result in more IO errors and
3460	* an endless shutdown/force loop.
3461	*/
3462	if (!log_error && !xlog_in_recovery(log))
3463	xfs_log_force(mp: log->l_mp, XFS_LOG_SYNC);
3464
3465	/*
3466	* Atomically set the shutdown state. If the shutdown state is already
3467	* set, there someone else is performing the shutdown and so we are done
3468	* here. This should never happen because we should only ever get called
3469	* once by the first shutdown caller.
3470	*
3471	* Much of the log state machine transitions assume that shutdown state
3472	* cannot change once they hold the log->l_icloglock. Hence we need to
3473	* hold that lock here, even though we use the atomic test_and_set_bit()
3474	* operation to set the shutdown state.
3475	*/
3476	spin_lock(lock: &log->l_icloglock);
3477	if (test_and_set_bit(XLOG_IO_ERROR, addr: &log->l_opstate)) {
3478	spin_unlock(lock: &log->l_icloglock);
3479	ASSERT(`0`);
3480	return false;
3481	}
3482	spin_unlock(lock: &log->l_icloglock);
3483
3484	/*
3485	* If this log shutdown also sets the mount shutdown state, issue a
3486	* shutdown warning message.
3487	*/
3488	if (!xfs_set_shutdown(mp: log->l_mp)) {
3489	xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
3490	"Filesystem has been shut down due to log error (0x%x).",
3491	shutdown_flags);
3492	xfs_alert(log->l_mp,
3493	"Please unmount the filesystem and rectify the problem(s).");
3494	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
3495	xfs_stack_trace();
3496	}
3497
3498	/*
3499	* We don't want anybody waiting for log reservations after this. That
3500	* means we have to wake up everybody queued up on reserveq as well as
3501	* writeq. In addition, we make sure in xlog_{re}grant_log_space that
3502	* we don't enqueue anything once the SHUTDOWN flag is set, and this
3503	* action is protected by the grant locks.
3504	*/
3505	xlog_grant_head_wake_all(head: &log->l_reserve_head);
3506	xlog_grant_head_wake_all(head: &log->l_write_head);
3507
3508	/*
3509	* Wake up everybody waiting on xfs_log_force. Wake the CIL push first
3510	* as if the log writes were completed. The abort handling in the log
3511	* item committed callback functions will do this again under lock to
3512	* avoid races.
3513	*/
3514	spin_lock(lock: &log->l_cilp->xc_push_lock);
3515	wake_up_all(&log->l_cilp->xc_start_wait);
3516	wake_up_all(&log->l_cilp->xc_commit_wait);
3517	spin_unlock(lock: &log->l_cilp->xc_push_lock);
3518
3519	spin_lock(lock: &log->l_icloglock);
3520	xlog_state_shutdown_callbacks(log);
3521	spin_unlock(lock: &log->l_icloglock);
3522
3523	wake_up_var(var: &log->l_opstate);
3524	if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp: log->l_mp))
3525	xfs_zoned_wake_all(mp: log->l_mp);
3526
3527	return log_error;
3528	}
3529
3530	STATIC int
3531	xlog_iclogs_empty(
3532	struct xlog *log)
3533	{
3534	xlog_in_core_t *iclog;
3535
3536	iclog = log->l_iclog;
3537	do {
3538	/ endianness does not matter here, zero is zero in*
3539	* any language.
3540	*/
3541	if (iclog->ic_header.h_num_logops)
3542	return `0`;
3543	iclog = iclog->ic_next;
3544	} while (iclog != log->l_iclog);
3545	return `1`;
3546	}
3547
3548	/*
3549	* Verify that an LSN stamped into a piece of metadata is valid. This is
3550	* intended for use in read verifiers on v5 superblocks.
3551	*/
3552	bool
3553	xfs_log_check_lsn(
3554	struct xfs_mount *mp,
3555	xfs_lsn_t lsn)
3556	{
3557	struct xlog *log = mp->m_log;
3558	bool valid;
3559
3560	/*
3561	* norecovery mode skips mount-time log processing and unconditionally
3562	* resets the in-core LSN. We can't validate in this mode, but
3563	* modifications are not allowed anyways so just return true.
3564	*/
3565	if (xfs_has_norecovery(mp))
3566	return true;
3567
3568	/*
3569	* Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
3570	* handled by recovery and thus safe to ignore here.
3571	*/
3572	if (lsn == NULLCOMMITLSN)
3573	return true;
3574
3575	valid = xlog_valid_lsn(mp->m_log, lsn);
3576
3577	/ warn the user about what's gone wrong before verifier failure /
3578	if (!valid) {
3579	spin_lock(lock: &log->l_icloglock);
3580	xfs_warn(mp,
3581	"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
3582	"Please unmount and run xfs_repair (>= v4.3) to resolve.",
3583	CYCLE_LSN(lsn), BLOCK_LSN(lsn),
3584	log->l_curr_cycle, log->l_curr_block);
3585	spin_unlock(lock: &log->l_icloglock);
3586	}
3587
3588	return valid;
3589	}
3590

source code of linux/fs/xfs/xfs_log.c