pipe.c source code [linux/fs/pipe.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/fs/pipe.c
4	*
5	* Copyright (C) 1991, 1992, 1999 Linus Torvalds
6	*/
7
8	#include <linux/mm.h>
9	#include <linux/file.h>
10	#include <linux/poll.h>
11	#include <linux/slab.h>
12	#include <linux/module.h>
13	#include <linux/init.h>
14	#include <linux/fs.h>
15	#include <linux/log2.h>
16	#include <linux/mount.h>
17	#include <linux/pseudo_fs.h>
18	#include <linux/magic.h>
19	#include <linux/pipe_fs_i.h>
20	#include <linux/uio.h>
21	#include <linux/highmem.h>
22	#include <linux/pagemap.h>
23	#include <linux/audit.h>
24	#include <linux/syscalls.h>
25	#include <linux/fcntl.h>
26	#include <linux/memcontrol.h>
27	#include <linux/watch_queue.h>
28	#include <linux/sysctl.h>
29	#include <linux/sort.h>
30
31	#include <linux/uaccess.h>
32	#include <asm/ioctls.h>
33
34	#include "internal.h"
35
36	/*
37	* New pipe buffers will be restricted to this size while the user is exceeding
38	* their pipe buffer quota. The general pipe use case needs at least two
39	* buffers: one for data yet to be read, and one for new data. If this is less
40	* than two, then a write to a non-empty pipe may block even if the pipe is not
41	* full. This can occur with GNU make jobserver or similar uses of pipes as
42	* semaphores: multiple processes may be waiting to write tokens back to the
43	* pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
44	*
45	* Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
46	* own risk, namely: pipe writes to non-full pipes may block until the pipe is
47	* emptied.
48	*/
49	#define PIPE_MIN_DEF_BUFFERS 2
50
51	/*
52	* The max size that a non-root user is allowed to grow the pipe. Can
53	* be set by root in /proc/sys/fs/pipe-max-size
54	*/
55	static unsigned int pipe_max_size = `1048576`;
56
57	/ Maximum allocatable pages per user. Hard limit is unset by default, soft*
58	* matches default values.
59	*/
60	static unsigned long pipe_user_pages_hard;
61	static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
62
63	/*
64	* We use head and tail indices that aren't masked off, except at the point of
65	* dereference, but rather they're allowed to wrap naturally. This means there
66	* isn't a dead spot in the buffer, but the ring has to be a power of two and
67	* <= 2^31.
68	* -- David Howells 2019-09-23.
69	*
70	* Reads with count = 0 should always return 0.
71	* -- Julian Bradfield 1999-06-07.
72	*
73	* FIFOs and Pipes now generate SIGIO for both readers and writers.
74	* -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
75	*
76	* pipe_read & write cleanup
77	* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
78	*/
79
#ifdef CONFIG_PROVE_LOCKING
/*
 * Comparison callback registered on the pipe mutex via lock_set_cmp_fn():
 * orders two lockdep maps by their addresses.
 */
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
			    const struct lockdep_map *b)
{
	unsigned long addr_a = (unsigned long)a;
	unsigned long addr_b = (unsigned long)b;

	return cmp_int(addr_a, addr_b);
}
#endif
87
88	void pipe_lock(struct pipe_inode_info *pipe)
89	{
90	if (pipe->files)
91	mutex_lock(&pipe->mutex);
92	}
93	EXPORT_SYMBOL(pipe_lock);
94
95	void pipe_unlock(struct pipe_inode_info *pipe)
96	{
97	if (pipe->files)
98	mutex_unlock(lock: &pipe->mutex);
99	}
100	EXPORT_SYMBOL(pipe_unlock);
101
/*
 * Lock two distinct pipes. The pipe at the lower address is always locked
 * first, so the order the caller passes them in does not matter.
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	struct pipe_inode_info *lower = pipe1;
	struct pipe_inode_info *upper = pipe2;

	BUG_ON(pipe1 == pipe2);

	if (pipe1 > pipe2) {
		lower = pipe2;
		upper = pipe1;
	}

	pipe_lock(lower);
	pipe_lock(upper);
}
113
114	static struct page anon_pipe_get_page(struct* pipe_inode_info *pipe)
115	{
116	for (int i = `0`; i < ARRAY_SIZE(pipe->tmp_page); i++) {
117	if (pipe->tmp_page[i]) {
118	struct page *page = pipe->tmp_page[i];
119	pipe->tmp_page[i] = NULL;
120	return page;
121	}
122	}
123
124	return alloc_page(GFP_HIGHUSER \| __GFP_ACCOUNT);
125	}
126
127	static void anon_pipe_put_page(struct pipe_inode_info *pipe,
128	struct page *page)
129	{
130	if (page_count(page) == `1`) {
131	for (int i = `0`; i < ARRAY_SIZE(pipe->tmp_page); i++) {
132	if (!pipe->tmp_page[i]) {
133	pipe->tmp_page[i] = page;
134	return;
135	}
136	}
137	}
138
139	put_page(page);
140	}
141
142	static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
143	struct pipe_buffer *buf)
144	{
145	struct page *page = buf->page;
146
147	anon_pipe_put_page(pipe, page);
148	}
149
150	static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
151	struct pipe_buffer *buf)
152	{
153	struct page *page = buf->page;
154
155	if (page_count(page) != `1`)
156	return false;
157	memcg_kmem_uncharge_page(page, order: `0`);
158	__SetPageLocked(page);
159	return true;
160	}
161
162	/**
163	* generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
164	* @pipe: the pipe that the buffer belongs to
165	* @buf: the buffer to attempt to steal
166	*
167	* Description:
168	* This function attempts to steal the &struct page attached to
169	* @buf. If successful, this function returns 0 and returns with
170	* the page locked. The caller may then reuse the page for whatever
171	* he wishes; the typical use is insertion into a different file
172	* page cache.
173	*/
174	bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
175	struct pipe_buffer *buf)
176	{
177	struct page *page = buf->page;
178
179	/*
180	* A reference of one is golden, that means that the owner of this
181	* page is the only one holding a reference to it. lock the page
182	* and return OK.
183	*/
184	if (page_count(page) == `1`) {
185	lock_page(page);
186	return true;
187	}
188	return false;
189	}
190	EXPORT_SYMBOL(generic_pipe_buf_try_steal);
191
192	/**
193	* generic_pipe_buf_get - get a reference to a &struct pipe_buffer
194	* @pipe: the pipe that the buffer belongs to
195	* @buf: the buffer to get a reference to
196	*
197	* Description:
198	* This function grabs an extra reference to @buf. It's used in
199	* the tee() system call, when we duplicate the buffers in one
200	* pipe into another.
201	*/
202	bool generic_pipe_buf_get(struct pipe_inode_info pipe, struct* pipe_buffer *buf)
203	{
204	return try_get_page(page: buf->page);
205	}
206	EXPORT_SYMBOL(generic_pipe_buf_get);
207
208	/**
209	* generic_pipe_buf_release - put a reference to a &struct pipe_buffer
210	* @pipe: the pipe that the buffer belongs to
211	* @buf: the buffer to put a reference to
212	*
213	* Description:
214	* This function releases a reference to @buf.
215	*/
216	void generic_pipe_buf_release(struct pipe_inode_info *pipe,
217	struct pipe_buffer *buf)
218	{
219	put_page(page: buf->page);
220	}
221	EXPORT_SYMBOL(generic_pipe_buf_release);
222
223	static const struct pipe_buf_operations anon_pipe_buf_ops = {
224	.release = anon_pipe_buf_release,
225	.try_steal = anon_pipe_buf_try_steal,
226	.get = generic_pipe_buf_get,
227	};
228
229	/ Done while waiting without holding the pipe lock - thus the READ_ONCE() /
230	static inline bool pipe_readable(const struct pipe_inode_info *pipe)
231	{
232	union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
233	unsigned int writers = READ_ONCE(pipe->writers);
234
235	return !pipe_empty(head: idx.head, tail: idx.tail) \|\| !writers;
236	}
237
238	static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
239	struct pipe_buffer *buf,
240	unsigned int tail)
241	{
242	pipe_buf_release(pipe, buf);
243
244	/*
245	* If the pipe has a watch_queue, we need additional protection
246	* by the spinlock because notifications get posted with only
247	* this spinlock, no mutex
248	*/
249	if (pipe_has_watch_queue(pipe)) {
250	spin_lock_irq(lock: &pipe->rd_wait.lock);
251	#ifdef CONFIG_WATCH_QUEUE
252	if (buf->flags & PIPE_BUF_FLAG_LOSS)
253	pipe->note_loss = true;
254	#endif
255	pipe->tail = ++tail;
256	spin_unlock_irq(lock: &pipe->rd_wait.lock);
257	return tail;
258	}
259
260	/*
261	* Without a watch_queue, we can simply increment the tail
262	* without the spinlock - the mutex is enough.
263	*/
264	pipe->tail = ++tail;
265	return tail;
266	}
267
268	static ssize_t
269	anon_pipe_read(struct kiocb iocb, struct* iov_iter *to)
270	{
271	size_t total_len = iov_iter_count(i: to);
272	struct file *filp = iocb->ki_filp;
273	struct pipe_inode_info *pipe = filp->private_data;
274	bool wake_writer = false, wake_next_reader = false;
275	ssize_t ret;
276
277	/ Null read succeeds. /
278	if (unlikely(total_len == `0`))
279	return `0`;
280
281	ret = `0`;
282	mutex_lock(&pipe->mutex);
283
284	/*
285	* We only wake up writers if the pipe was full when we started reading
286	* and it is no longer full after reading to avoid unnecessary wakeups.
287	*
288	* But when we do wake up writers, we do so using a sync wakeup
289	* (WF_SYNC), because we want them to get going and generate more
290	* data for us.
291	*/
292	for (;;) {
293	/ Read ->head with a barrier vs post_one_notification() /
294	unsigned int head = smp_load_acquire(&pipe->head);
295	unsigned int tail = pipe->tail;
296
297	#ifdef CONFIG_WATCH_QUEUE
298	if (pipe->note_loss) {
299	struct watch_notification n;
300
301	if (total_len < `8`) {
302	if (ret == `0`)
303	ret = -ENOBUFS;
304	break;
305	}
306
307	n.type = WATCH_TYPE_META;
308	n.subtype = WATCH_META_LOSS_NOTIFICATION;
309	n.info = watch_sizeof(n);
310	if (copy_to_iter(addr: &n, bytes: sizeof(n), i: to) != sizeof(n)) {
311	if (ret == `0`)
312	ret = -EFAULT;
313	break;
314	}
315	ret += sizeof(n);
316	total_len -= sizeof(n);
317	pipe->note_loss = false;
318	}
319	#endif
320
321	if (!pipe_empty(head, tail)) {
322	struct pipe_buffer *buf = pipe_buf(pipe, slot: tail);
323	size_t chars = buf->len;
324	size_t written;
325	int error;
326
327	if (chars > total_len) {
328	if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
329	if (ret == `0`)
330	ret = -ENOBUFS;
331	break;
332	}
333	chars = total_len;
334	}
335
336	error = pipe_buf_confirm(pipe, buf);
337	if (error) {
338	if (!ret)
339	ret = error;
340	break;
341	}
342
343	written = copy_page_to_iter(page: buf->page, offset: buf->offset, bytes: chars, i: to);
344	if (unlikely(written < chars)) {
345	if (!ret)
346	ret = -EFAULT;
347	break;
348	}
349	ret += chars;
350	buf->offset += chars;
351	buf->len -= chars;
352
353	/ Was it a packet buffer? Clean up and exit /
354	if (buf->flags & PIPE_BUF_FLAG_PACKET) {
355	total_len = chars;
356	buf->len = `0`;
357	}
358
359	if (!buf->len) {
360	wake_writer \|= pipe_full(head, tail, limit: pipe->max_usage);
361	tail = pipe_update_tail(pipe, buf, tail);
362	}
363	total_len -= chars;
364	if (!total_len)
365	break; / common path: read succeeded /
366	if (!pipe_empty(head, tail)) / More to do? /
367	continue;
368	}
369
370	if (!pipe->writers)
371	break;
372	if (ret)
373	break;
374	if ((filp->f_flags & O_NONBLOCK) \|\|
375	(iocb->ki_flags & IOCB_NOWAIT)) {
376	ret = -EAGAIN;
377	break;
378	}
379	mutex_unlock(lock: &pipe->mutex);
380	/*
381	* We only get here if we didn't actually read anything.
382	*
383	* But because we didn't read anything, at this point we can
384	* just return directly with -ERESTARTSYS if we're interrupted,
385	* since we've done any required wakeups and there's no need
386	* to mark anything accessed. And we've dropped the lock.
387	*/
388	if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < `0`)
389	return -ERESTARTSYS;
390
391	wake_next_reader = true;
392	mutex_lock(&pipe->mutex);
393	}
394	if (pipe_is_empty(pipe))
395	wake_next_reader = false;
396	mutex_unlock(lock: &pipe->mutex);
397
398	if (wake_writer)
399	wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT \| EPOLLWRNORM);
400	if (wake_next_reader)
401	wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN \| EPOLLRDNORM);
402	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
403	return ret;
404	}
405
406	static ssize_t
407	fifo_pipe_read(struct kiocb iocb, struct* iov_iter *to)
408	{
409	int ret = anon_pipe_read(iocb, to);
410	if (ret > `0`)
411	file_accessed(file: iocb->ki_filp);
412	return ret;
413	}
414
415	static inline int is_packetized(struct file *file)
416	{
417	return (file->f_flags & O_DIRECT) != `0`;
418	}
419
420	/ Done while waiting without holding the pipe lock - thus the READ_ONCE() /
421	static inline bool pipe_writable(const struct pipe_inode_info *pipe)
422	{
423	union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
424	unsigned int max_usage = READ_ONCE(pipe->max_usage);
425
426	return !pipe_full(head: idx.head, tail: idx.tail, limit: max_usage) \|\|
427	!READ_ONCE(pipe->readers);
428	}
429
430	static ssize_t
431	anon_pipe_write(struct kiocb iocb, struct* iov_iter *from)
432	{
433	struct file *filp = iocb->ki_filp;
434	struct pipe_inode_info *pipe = filp->private_data;
435	unsigned int head;
436	ssize_t ret = `0`;
437	size_t total_len = iov_iter_count(i: from);
438	ssize_t chars;
439	bool was_empty = false;
440	bool wake_next_writer = false;
441
442	/*
443	* Reject writing to watch queue pipes before the point where we lock
444	* the pipe.
445	* Otherwise, lockdep would be unhappy if the caller already has another
446	* pipe locked.
447	* If we had to support locking a normal pipe and a notification pipe at
448	* the same time, we could set up lockdep annotations for that, but
449	* since we don't actually need that, it's simpler to just bail here.
450	*/
451	if (pipe_has_watch_queue(pipe))
452	return -EXDEV;
453
454	/ Null write succeeds. /
455	if (unlikely(total_len == `0`))
456	return `0`;
457
458	mutex_lock(&pipe->mutex);
459
460	if (!pipe->readers) {
461	if ((iocb->ki_flags & IOCB_NOSIGNAL) == `0`)
462	send_sig(SIGPIPE, current, `0`);
463	ret = -EPIPE;
464	goto out;
465	}
466
467	/*
468	* If it wasn't empty we try to merge new data into
469	* the last buffer.
470	*
471	* That naturally merges small writes, but it also
472	* page-aligns the rest of the writes for large writes
473	* spanning multiple pages.
474	*/
475	head = pipe->head;
476	was_empty = pipe_empty(head, tail: pipe->tail);
477	chars = total_len & (PAGE_SIZE-`1`);
478	if (chars && !was_empty) {
479	struct pipe_buffer *buf = pipe_buf(pipe, slot: head - `1`);
480	int offset = buf->offset + buf->len;
481
482	if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
483	offset + chars <= PAGE_SIZE) {
484	ret = pipe_buf_confirm(pipe, buf);
485	if (ret)
486	goto out;
487
488	ret = copy_page_from_iter(page: buf->page, offset, bytes: chars, i: from);
489	if (unlikely(ret < chars)) {
490	ret = -EFAULT;
491	goto out;
492	}
493
494	buf->len += ret;
495	if (!iov_iter_count(i: from))
496	goto out;
497	}
498	}
499
500	for (;;) {
501	if (!pipe->readers) {
502	if ((iocb->ki_flags & IOCB_NOSIGNAL) == `0`)
503	send_sig(SIGPIPE, current, `0`);
504	if (!ret)
505	ret = -EPIPE;
506	break;
507	}
508
509	head = pipe->head;
510	if (!pipe_full(head, tail: pipe->tail, limit: pipe->max_usage)) {
511	struct pipe_buffer *buf;
512	struct page *page;
513	int copied;
514
515	page = anon_pipe_get_page(pipe);
516	if (unlikely(!page)) {
517	if (!ret)
518	ret = -ENOMEM;
519	break;
520	}
521
522	copied = copy_page_from_iter(page, offset: `0`, PAGE_SIZE, i: from);
523	if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
524	anon_pipe_put_page(pipe, page);
525	if (!ret)
526	ret = -EFAULT;
527	break;
528	}
529
530	pipe->head = head + `1`;
531	/ Insert it into the buffer array /
532	buf = pipe_buf(pipe, slot: head);
533	buf->page = page;
534	buf->ops = &anon_pipe_buf_ops;
535	buf->offset = `0`;
536	if (is_packetized(file: filp))
537	buf->flags = PIPE_BUF_FLAG_PACKET;
538	else
539	buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
540
541	buf->len = copied;
542	ret += copied;
543
544	if (!iov_iter_count(i: from))
545	break;
546
547	continue;
548	}
549
550	/ Wait for buffer space to become available. /
551	if ((filp->f_flags & O_NONBLOCK) \|\|
552	(iocb->ki_flags & IOCB_NOWAIT)) {
553	if (!ret)
554	ret = -EAGAIN;
555	break;
556	}
557	if (signal_pending(current)) {
558	if (!ret)
559	ret = -ERESTARTSYS;
560	break;
561	}
562
563	/*
564	* We're going to release the pipe lock and wait for more
565	* space. We wake up any readers if necessary, and then
566	* after waiting we need to re-check whether the pipe
567	* become empty while we dropped the lock.
568	*/
569	mutex_unlock(lock: &pipe->mutex);
570	if (was_empty)
571	wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN \| EPOLLRDNORM);
572	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
573	wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
574	mutex_lock(&pipe->mutex);
575	was_empty = pipe_is_empty(pipe);
576	wake_next_writer = true;
577	}
578	out:
579	if (pipe_is_full(pipe))
580	wake_next_writer = false;
581	mutex_unlock(lock: &pipe->mutex);
582
583	/*
584	* If we do do a wakeup event, we do a 'sync' wakeup, because we
585	* want the reader to start processing things asap, rather than
586	* leave the data pending.
587	*
588	* This is particularly important for small writes, because of
589	* how (for example) the GNU make jobserver uses small writes to
590	* wake up pending jobs
591	*
592	* Epoll nonsensically wants a wakeup whether the pipe
593	* was already empty or not.
594	*/
595	if (was_empty \|\| pipe->poll_usage)
596	wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN \| EPOLLRDNORM);
597	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
598	if (wake_next_writer)
599	wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT \| EPOLLWRNORM);
600	return ret;
601	}
602
603	static ssize_t
604	fifo_pipe_write(struct kiocb iocb, struct* iov_iter *from)
605	{
606	int ret = anon_pipe_write(iocb, from);
607	if (ret > `0`) {
608	struct file *filp = iocb->ki_filp;
609	if (sb_start_write_trylock(sb: file_inode(f: filp)->i_sb)) {
610	int err = file_update_time(file: filp);
611	if (err)
612	ret = err;
613	sb_end_write(sb: file_inode(f: filp)->i_sb);
614	}
615	}
616	return ret;
617	}
618
619	static long pipe_ioctl(struct file filp, unsigned* int cmd, unsigned long arg)
620	{
621	struct pipe_inode_info *pipe = filp->private_data;
622	unsigned int count, head, tail;
623
624	switch (cmd) {
625	case FIONREAD:
626	mutex_lock(&pipe->mutex);
627	count = `0`;
628	head = pipe->head;
629	tail = pipe->tail;
630
631	while (!pipe_empty(head, tail)) {
632	count += pipe_buf(pipe, slot: tail)->len;
633	tail++;
634	}
635	mutex_unlock(lock: &pipe->mutex);
636
637	return put_user(count, (int __user *)arg);
638
639	#ifdef CONFIG_WATCH_QUEUE
640	case IOC_WATCH_QUEUE_SET_SIZE: {
641	int ret;
642	mutex_lock(&pipe->mutex);
643	ret = watch_queue_set_size(pipe, arg);
644	mutex_unlock(lock: &pipe->mutex);
645	return ret;
646	}
647
648	case IOC_WATCH_QUEUE_SET_FILTER:
649	return watch_queue_set_filter(
650	pipe, (struct watch_notification_filter __user *)arg);
651	#endif
652
653	default:
654	return -ENOIOCTLCMD;
655	}
656	}
657
658	/ No kernel lock held - fine /
659	static __poll_t
660	pipe_poll(struct file filp, poll_table wait)
661	{
662	__poll_t mask;
663	struct pipe_inode_info *pipe = filp->private_data;
664	union pipe_index idx;
665
666	/ Epoll has some historical nasty semantics, this enables them /
667	WRITE_ONCE(pipe->poll_usage, true);
668
669	/*
670	* Reading pipe state only -- no need for acquiring the semaphore.
671	*
672	* But because this is racy, the code has to add the
673	* entry to the poll table _first_ ..
674	*/
675	if (filp->f_mode & FMODE_READ)
676	poll_wait(filp, wait_address: &pipe->rd_wait, p: wait);
677	if (filp->f_mode & FMODE_WRITE)
678	poll_wait(filp, wait_address: &pipe->wr_wait, p: wait);
679
680	/*
681	* .. and only then can you do the racy tests. That way,
682	* if something changes and you got it wrong, the poll
683	* table entry will wake you up and fix it.
684	*/
685	idx.head_tail = READ_ONCE(pipe->head_tail);
686
687	mask = `0`;
688	if (filp->f_mode & FMODE_READ) {
689	if (!pipe_empty(head: idx.head, tail: idx.tail))
690	mask \|= EPOLLIN \| EPOLLRDNORM;
691	if (!pipe->writers && filp->f_pipe != pipe->w_counter)
692	mask \|= EPOLLHUP;
693	}
694
695	if (filp->f_mode & FMODE_WRITE) {
696	if (!pipe_full(head: idx.head, tail: idx.tail, limit: pipe->max_usage))
697	mask \|= EPOLLOUT \| EPOLLWRNORM;
698	/*
699	* Most Unices do not set EPOLLERR for FIFOs but on Linux they
700	* behave exactly like pipes for poll().
701	*/
702	if (!pipe->readers)
703	mask \|= EPOLLERR;
704	}
705
706	return mask;
707	}
708
709	static void put_pipe_info(struct inode inode, struct* pipe_inode_info *pipe)
710	{
711	int kill = `0`;
712
713	spin_lock(lock: &inode->i_lock);
714	if (!--pipe->files) {
715	inode->i_pipe = NULL;
716	kill = `1`;
717	}
718	spin_unlock(lock: &inode->i_lock);
719
720	if (kill)
721	free_pipe_info(pipe);
722	}
723
724	static int
725	pipe_release(struct inode inode, struct* file *file)
726	{
727	struct pipe_inode_info *pipe = file->private_data;
728
729	mutex_lock(&pipe->mutex);
730	if (file->f_mode & FMODE_READ)
731	pipe->readers--;
732	if (file->f_mode & FMODE_WRITE)
733	pipe->writers--;
734
735	/ Was that the last reader or writer, but not the other side? /
736	if (!pipe->readers != !pipe->writers) {
737	wake_up_interruptible_all(&pipe->rd_wait);
738	wake_up_interruptible_all(&pipe->wr_wait);
739	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
740	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
741	}
742	mutex_unlock(lock: &pipe->mutex);
743
744	put_pipe_info(inode, pipe);
745	return `0`;
746	}
747
748	static int
749	pipe_fasync(int fd, struct file filp, int* on)
750	{
751	struct pipe_inode_info *pipe = filp->private_data;
752	int retval = `0`;
753
754	mutex_lock(&pipe->mutex);
755	if (filp->f_mode & FMODE_READ)
756	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
757	if ((filp->f_mode & FMODE_WRITE) && retval >= `0`) {
758	retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
759	if (retval < `0` && (filp->f_mode & FMODE_READ))
760	/ this can happen only if on == T /
761	fasync_helper(-`1`, filp, `0`, &pipe->fasync_readers);
762	}
763	mutex_unlock(lock: &pipe->mutex);
764	return retval;
765	}
766
767	unsigned long account_pipe_buffers(struct user_struct *user,
768	unsigned long old, unsigned long new)
769	{
770	return atomic_long_add_return(i: new - old, v: &user->pipe_bufs);
771	}
772
773	bool too_many_pipe_buffers_soft(unsigned long user_bufs)
774	{
775	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
776
777	return soft_limit && user_bufs > soft_limit;
778	}
779
780	bool too_many_pipe_buffers_hard(unsigned long user_bufs)
781	{
782	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
783
784	return hard_limit && user_bufs > hard_limit;
785	}
786
787	bool pipe_is_unprivileged_user(void)
788	{
789	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
790	}
791
792	struct pipe_inode_info alloc_pipe_info(void*)
793	{
794	struct pipe_inode_info *pipe;
795	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
796	struct user_struct *user = get_current_user();
797	unsigned long user_bufs;
798	unsigned int max_size = READ_ONCE(pipe_max_size);
799
800	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
801	if (pipe == NULL)
802	goto out_free_uid;
803
804	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
805	pipe_bufs = max_size >> PAGE_SHIFT;
806
807	user_bufs = account_pipe_buffers(user, old: `0`, new: pipe_bufs);
808
809	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
810	user_bufs = account_pipe_buffers(user, old: pipe_bufs, PIPE_MIN_DEF_BUFFERS);
811	pipe_bufs = PIPE_MIN_DEF_BUFFERS;
812	}
813
814	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
815	goto out_revert_acct;
816
817	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
818	GFP_KERNEL_ACCOUNT);
819
820	if (pipe->bufs) {
821	init_waitqueue_head(&pipe->rd_wait);
822	init_waitqueue_head(&pipe->wr_wait);
823	pipe->r_counter = pipe->w_counter = `1`;
824	pipe->max_usage = pipe_bufs;
825	pipe->ring_size = pipe_bufs;
826	pipe->nr_accounted = pipe_bufs;
827	pipe->user = user;
828	mutex_init(&pipe->mutex);
829	lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
830	return pipe;
831	}
832
833	out_revert_acct:
834	(void) account_pipe_buffers(user, old: pipe_bufs, new: `0`);
835	kfree(objp: pipe);
836	out_free_uid:
837	free_uid(user);
838	return NULL;
839	}
840
841	void free_pipe_info(struct pipe_inode_info *pipe)
842	{
843	unsigned int i;
844
845	#ifdef CONFIG_WATCH_QUEUE
846	if (pipe->watch_queue)
847	watch_queue_clear(pipe->watch_queue);
848	#endif
849
850	(void) account_pipe_buffers(user: pipe->user, old: pipe->nr_accounted, new: `0`);
851	free_uid(pipe->user);
852	for (i = `0`; i < pipe->ring_size; i++) {
853	struct pipe_buffer *buf = pipe->bufs + i;
854	if (buf->ops)
855	pipe_buf_release(pipe, buf);
856	}
857	#ifdef CONFIG_WATCH_QUEUE
858	if (pipe->watch_queue)
859	put_watch_queue(pipe->watch_queue);
860	#endif
861	for (i = `0`; i < ARRAY_SIZE(pipe->tmp_page); i++) {
862	if (pipe->tmp_page[i])
863	__free_page(pipe->tmp_page[i]);
864	}
865	kfree(objp: pipe->bufs);
866	kfree(objp: pipe);
867	}
868
869	static struct vfsmount *pipe_mnt __ro_after_init;
870
871	/*
872	* pipefs_dname() is called from d_path().
873	*/
874	static char pipefs_dname(struct* dentry dentry, char* buffer, int* buflen)
875	{
876	return dynamic_dname(buffer, buflen, "pipe:[%lu]",
877	d_inode(dentry)->i_ino);
878	}
879
880	static const struct dentry_operations pipefs_dentry_operations = {
881	.d_dname = pipefs_dname,
882	};
883
884	static const struct file_operations pipeanon_fops;
885
886	static struct inode * get_pipe_inode(void)
887	{
888	struct inode *inode = new_inode_pseudo(sb: pipe_mnt->mnt_sb);
889	struct pipe_inode_info *pipe;
890
891	if (!inode)
892	goto fail_inode;
893
894	inode->i_ino = get_next_ino();
895
896	pipe = alloc_pipe_info();
897	if (!pipe)
898	goto fail_iput;
899
900	inode->i_pipe = pipe;
901	pipe->files = `2`;
902	pipe->readers = pipe->writers = `1`;
903	inode->i_fop = &pipeanon_fops;
904
905	/*
906	* Mark the inode dirty from the very beginning,
907	* that way it will never be moved to the dirty
908	* list because "mark_inode_dirty()" will think
909	* that it already _is_ on the dirty list.
910	*/
911	inode_state_assign_raw(inode, I_DIRTY);
912	inode->i_mode = S_IFIFO \| S_IRUSR \| S_IWUSR;
913	inode->i_uid = current_fsuid();
914	inode->i_gid = current_fsgid();
915	simple_inode_init_ts(inode);
916
917	return inode;
918
919	fail_iput:
920	iput(inode);
921
922	fail_inode:
923	return NULL;
924	}
925
926	int create_pipe_files(struct file *res, int* flags)
927	{
928	struct inode *inode = get_pipe_inode();
929	struct file *f;
930	int error;
931
932	if (!inode)
933	return -ENFILE;
934
935	if (flags & O_NOTIFICATION_PIPE) {
936	error = watch_queue_init(inode->i_pipe);
937	if (error) {
938	free_pipe_info(pipe: inode->i_pipe);
939	iput(inode);
940	return error;
941	}
942	}
943
944	f = alloc_file_pseudo(inode, pipe_mnt, "",
945	O_WRONLY \| (flags & (O_NONBLOCK \| O_DIRECT)),
946	&pipeanon_fops);
947	if (IS_ERR(ptr: f)) {
948	free_pipe_info(pipe: inode->i_pipe);
949	iput(inode);
950	return PTR_ERR(ptr: f);
951	}
952
953	f->private_data = inode->i_pipe;
954	f->f_pipe = `0`;
955
956	res[`0`] = alloc_file_clone(f, O_RDONLY \| (flags & O_NONBLOCK),
957	&pipeanon_fops);
958	if (IS_ERR(ptr: res[`0`])) {
959	put_pipe_info(inode, pipe: inode->i_pipe);
960	fput(f);
961	return PTR_ERR(ptr: res[`0`]);
962	}
963	res[`0`]->private_data = inode->i_pipe;
964	res[`0`]->f_pipe = `0`;
965	res[`1`] = f;
966	stream_open(inode, filp: res[`0`]);
967	stream_open(inode, filp: res[`1`]);
968
969	/ pipe groks IOCB_NOWAIT /
970	res[`0`]->f_mode \|= FMODE_NOWAIT;
971	res[`1`]->f_mode \|= FMODE_NOWAIT;
972
973	/*
974	* Disable permission and pre-content events, but enable legacy
975	* inotify events for legacy users.
976	*/
977	file_set_fsnotify_mode(file: res[`0`], FMODE_NONOTIFY_PERM);
978	file_set_fsnotify_mode(file: res[`1`], FMODE_NONOTIFY_PERM);
979	return `0`;
980	}
981
982	static int __do_pipe_flags(int fd, struct* file *files, int* flags)
983	{
984	int error;
985	int fdw, fdr;
986
987	if (flags & ~(O_CLOEXEC \| O_NONBLOCK \| O_DIRECT \| O_NOTIFICATION_PIPE))
988	return -EINVAL;
989
990	error = create_pipe_files(res: files, flags);
991	if (error)
992	return error;
993
994	error = get_unused_fd_flags(flags);
995	if (error < `0`)
996	goto err_read_pipe;
997	fdr = error;
998
999	error = get_unused_fd_flags(flags);
1000	if (error < `0`)
1001	goto err_fdr;
1002	fdw = error;
1003
1004	audit_fd_pair(fd1: fdr, fd2: fdw);
1005	fd[`0`] = fdr;
1006	fd[`1`] = fdw;
1007	return `0`;
1008
1009	err_fdr:
1010	put_unused_fd(fd: fdr);
1011	err_read_pipe:
1012	fput(files[`0`]);
1013	fput(files[`1`]);
1014	return error;
1015	}
1016
/*
 * Create a pipe and install both ends in the current file table.
 *
 * On success fd[0] is the read descriptor, fd[1] the write descriptor,
 * and 0 is returned; otherwise a negative errno is returned.
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);

	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}
1027
1028	/*
1029	* sys_pipe() is the normal C calling standard for creating
1030	* a pipe. It's not the way Unix traditionally does this, though.
1031	*/
1032	static int do_pipe2(int __user fildes, int* flags)
1033	{
1034	struct file *files[`2`];
1035	int fd[`2`];
1036	int error;
1037
1038	error = __do_pipe_flags(fd, files, flags);
1039	if (!error) {
1040	if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
1041	fput(files[`0`]);
1042	fput(files[`1`]);
1043	put_unused_fd(fd: fd[`0`]);
1044	put_unused_fd(fd: fd[`1`]);
1045	error = -EFAULT;
1046	} else {
1047	fd_install(fd: fd[`0`], file: files[`0`]);
1048	fd_install(fd: fd[`1`], file: files[`1`]);
1049	}
1050	}
1051	return error;
1052	}
1053
1054	SYSCALL_DEFINE2(pipe2, int __user , fildes, int*, flags)
1055	{
1056	return do_pipe2(fildes, flags);
1057	}
1058
1059	SYSCALL_DEFINE1(pipe, int __user *, fildes)
1060	{
1061	return do_pipe2(fildes, flags: `0`);
1062	}
1063
1064	/*
1065	* This is the stupid "wait for pipe to be readable or writable"
1066	* model.
1067	*
1068	* See pipe_read/write() for the proper kind of exclusive wait,
1069	* but that requires that we wake up any other readers/writers
1070	* if we then do not end up reading everything (ie the whole
1071	* "wake_next_reader/writer" logic in pipe_read/write()).
1072	*/
1073	void pipe_wait_readable(struct pipe_inode_info *pipe)
1074	{
1075	pipe_unlock(pipe);
1076	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
1077	pipe_lock(pipe);
1078	}
1079
1080	void pipe_wait_writable(struct pipe_inode_info *pipe)
1081	{
1082	pipe_unlock(pipe);
1083	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
1084	pipe_lock(pipe);
1085	}
1086
1087	/*
1088	* This depends on both the wait (here) and the wakeup (wake_up_partner)
1089	* holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
1090	* race with the count check and waitqueue prep.
1091	*
1092	* Normally in order to avoid races, you'd do the prepare_to_wait() first,
1093	* then check the condition you're waiting for, and only then sleep. But
1094	* because of the pipe lock, we can check the condition before being on
1095	* the wait queue.
1096	*
1097	* We use the 'rd_wait' waitqueue for pipe partner waiting.
1098	*/
1099	static int wait_for_partner(struct pipe_inode_info pipe, unsigned* int *cnt)
1100	{
1101	DEFINE_WAIT(rdwait);
1102	int cur = *cnt;
1103
1104	while (cur == *cnt) {
1105	prepare_to_wait(wq_head: &pipe->rd_wait, wq_entry: &rdwait, TASK_INTERRUPTIBLE);
1106	pipe_unlock(pipe);
1107	schedule();
1108	finish_wait(wq_head: &pipe->rd_wait, wq_entry: &rdwait);
1109	pipe_lock(pipe);
1110	if (signal_pending(current))
1111	break;
1112	}
1113	return cur == *cnt ? -ERESTARTSYS : `0`;
1114	}
1115
1116	static void wake_up_partner(struct pipe_inode_info *pipe)
1117	{
1118	wake_up_interruptible_all(&pipe->rd_wait);
1119	}
1120
1121	static int fifo_open(struct inode inode, struct* file *filp)
1122	{
1123	bool is_pipe = inode->i_fop == &pipeanon_fops;
1124	struct pipe_inode_info *pipe;
1125	int ret;
1126
1127	filp->f_pipe = `0`;
1128
1129	spin_lock(lock: &inode->i_lock);
1130	if (inode->i_pipe) {
1131	pipe = inode->i_pipe;
1132	pipe->files++;
1133	spin_unlock(lock: &inode->i_lock);
1134	} else {
1135	spin_unlock(lock: &inode->i_lock);
1136	pipe = alloc_pipe_info();
1137	if (!pipe)
1138	return -ENOMEM;
1139	pipe->files = `1`;
1140	spin_lock(lock: &inode->i_lock);
1141	if (unlikely(inode->i_pipe)) {
1142	inode->i_pipe->files++;
1143	spin_unlock(lock: &inode->i_lock);
1144	free_pipe_info(pipe);
1145	pipe = inode->i_pipe;
1146	} else {
1147	inode->i_pipe = pipe;
1148	spin_unlock(lock: &inode->i_lock);
1149	}
1150	}
1151	filp->private_data = pipe;
1152	/ OK, we have a pipe and it's pinned down /
1153
1154	mutex_lock(&pipe->mutex);
1155
1156	/ We can only do regular read/write on fifos /
1157	stream_open(inode, filp);
1158
1159	switch (filp->f_mode & (FMODE_READ \| FMODE_WRITE)) {
1160	case FMODE_READ:
1161	/*
1162	* O_RDONLY
1163	* POSIX.1 says that O_NONBLOCK means return with the FIFO
1164	* opened, even when there is no process writing the FIFO.
1165	*/
1166	pipe->r_counter++;
1167	if (pipe->readers++ == `0`)
1168	wake_up_partner(pipe);
1169
1170	if (!is_pipe && !pipe->writers) {
1171	if ((filp->f_flags & O_NONBLOCK)) {
1172	/ suppress EPOLLHUP until we have*
1173	* seen a writer */
1174	filp->f_pipe = pipe->w_counter;
1175	} else {
1176	if (wait_for_partner(pipe, cnt: &pipe->w_counter))
1177	goto err_rd;
1178	}
1179	}
1180	break;
1181
1182	case FMODE_WRITE:
1183	/*
1184	* O_WRONLY
1185	* POSIX.1 says that O_NONBLOCK means return -1 with
1186	* errno=ENXIO when there is no process reading the FIFO.
1187	*/
1188	ret = -ENXIO;
1189	if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
1190	goto err;
1191
1192	pipe->w_counter++;
1193	if (!pipe->writers++)
1194	wake_up_partner(pipe);
1195
1196	if (!is_pipe && !pipe->readers) {
1197	if (wait_for_partner(pipe, cnt: &pipe->r_counter))
1198	goto err_wr;
1199	}
1200	break;
1201
1202	case FMODE_READ \| FMODE_WRITE:
1203	/*
1204	* O_RDWR
1205	* POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
1206	* This implementation will NEVER block on a O_RDWR open, since
1207	* the process can at least talk to itself.
1208	*/
1209
1210	pipe->readers++;
1211	pipe->writers++;
1212	pipe->r_counter++;
1213	pipe->w_counter++;
1214	if (pipe->readers == `1` \|\| pipe->writers == `1`)
1215	wake_up_partner(pipe);
1216	break;
1217
1218	default:
1219	ret = -EINVAL;
1220	goto err;
1221	}
1222
1223	/ Ok! /
1224	mutex_unlock(lock: &pipe->mutex);
1225	return `0`;
1226
1227	err_rd:
1228	if (!--pipe->readers)
1229	wake_up_interruptible(&pipe->wr_wait);
1230	ret = -ERESTARTSYS;
1231	goto err;
1232
1233	err_wr:
1234	if (!--pipe->writers)
1235	wake_up_interruptible_all(&pipe->rd_wait);
1236	ret = -ERESTARTSYS;
1237	goto err;
1238
1239	err:
1240	mutex_unlock(lock: &pipe->mutex);
1241
1242	put_pipe_info(inode, pipe);
1243	return ret;
1244	}
1245
/*
 * File operations that use the fifo_pipe_read()/fifo_pipe_write() I/O
 * paths.  get_pipe_info() accepts a file as a pipe when its f_op points
 * either here or at pipeanon_fops.
 */
const struct file_operations pipefifo_fops = {
	.open = fifo_open,
	.read_iter = fifo_pipe_read,
	.write_iter = fifo_pipe_write,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.release = pipe_release,
	.fasync = pipe_fasync,
	.splice_write = iter_file_splice_write,
};
1256
/*
 * File operations that use the anon_pipe_read()/anon_pipe_write() I/O
 * paths.  fifo_open() treats an inode whose i_fop is this table as a plain
 * pipe (is_pipe) and skips the FIFO-only partner waiting and -ENXIO check.
 */
static const struct file_operations pipeanon_fops = {
	.open = fifo_open,
	.read_iter = anon_pipe_read,
	.write_iter = anon_pipe_write,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.release = pipe_release,
	.fasync = pipe_fasync,
	.splice_write = iter_file_splice_write,
};
1267
1268	/*
1269	* Currently we rely on the pipe array holding a power-of-2 number
1270	* of pages. Returns 0 on error.
1271	*/
1272	unsigned int round_pipe_size(unsigned int size)
1273	{
1274	if (size > (`1U` << `31`))
1275	return `0`;
1276
1277	/ Minimum pipe size, as required by POSIX /
1278	if (size < PAGE_SIZE)
1279	return PAGE_SIZE;
1280
1281	return roundup_pow_of_two(size);
1282	}
1283
1284	/*
1285	* Resize the pipe ring to a number of slots.
1286	*
1287	* Note the pipe can be reduced in capacity, but only if the current
1288	* occupancy doesn't exceed nr_slots; if it does, EBUSY will be
1289	* returned instead.
1290	*/
1291	int pipe_resize_ring(struct pipe_inode_info pipe, unsigned* int nr_slots)
1292	{
1293	struct pipe_buffer *bufs;
1294	unsigned int head, tail, mask, n;
1295
1296	/ nr_slots larger than limits of pipe->{head,tail} /
1297	if (unlikely(nr_slots > (pipe_index_t)-`1u`))
1298	return -EINVAL;
1299
1300	bufs = kcalloc(nr_slots, sizeof(*bufs),
1301	GFP_KERNEL_ACCOUNT \| __GFP_NOWARN);
1302	if (unlikely(!bufs))
1303	return -ENOMEM;
1304
1305	spin_lock_irq(lock: &pipe->rd_wait.lock);
1306	mask = pipe->ring_size - `1`;
1307	head = pipe->head;
1308	tail = pipe->tail;
1309
1310	n = pipe_occupancy(head, tail);
1311	if (nr_slots < n) {
1312	spin_unlock_irq(lock: &pipe->rd_wait.lock);
1313	kfree(objp: bufs);
1314	return -EBUSY;
1315	}
1316
1317	/*
1318	* The pipe array wraps around, so just start the new one at zero
1319	* and adjust the indices.
1320	*/
1321	if (n > `0`) {
1322	unsigned int h = head & mask;
1323	unsigned int t = tail & mask;
1324	if (h > t) {
1325	memcpy(bufs, pipe->bufs + t,
1326	n * sizeof(struct pipe_buffer));
1327	} else {
1328	unsigned int tsize = pipe->ring_size - t;
1329	if (h > `0`)
1330	memcpy(bufs + tsize, pipe->bufs,
1331	h * sizeof(struct pipe_buffer));
1332	memcpy(bufs, pipe->bufs + t,
1333	tsize * sizeof(struct pipe_buffer));
1334	}
1335	}
1336
1337	head = n;
1338	tail = `0`;
1339
1340	kfree(objp: pipe->bufs);
1341	pipe->bufs = bufs;
1342	pipe->ring_size = nr_slots;
1343	if (pipe->max_usage > nr_slots)
1344	pipe->max_usage = nr_slots;
1345	pipe->tail = tail;
1346	pipe->head = head;
1347
1348	if (!pipe_has_watch_queue(pipe)) {
1349	pipe->max_usage = nr_slots;
1350	pipe->nr_accounted = nr_slots;
1351	}
1352
1353	spin_unlock_irq(lock: &pipe->rd_wait.lock);
1354
1355	/ This might have made more room for writers /
1356	wake_up_interruptible(&pipe->wr_wait);
1357	return `0`;
1358	}
1359
1360	/*
1361	* Allocate a new array of pipe buffers and copy the info over. Returns the
1362	* pipe size if successful, or return -ERROR on error.
1363	*/
1364	static long pipe_set_size(struct pipe_inode_info pipe, unsigned* int arg)
1365	{
1366	unsigned long user_bufs;
1367	unsigned int nr_slots, size;
1368	long ret = `0`;
1369
1370	if (pipe_has_watch_queue(pipe))
1371	return -EBUSY;
1372
1373	size = round_pipe_size(size: arg);
1374	nr_slots = size >> PAGE_SHIFT;
1375
1376	if (!nr_slots)
1377	return -EINVAL;
1378
1379	/*
1380	* If trying to increase the pipe capacity, check that an
1381	* unprivileged user is not trying to exceed various limits
1382	* (soft limit check here, hard limit check just below).
1383	* Decreasing the pipe capacity is always permitted, even
1384	* if the user is currently over a limit.
1385	*/
1386	if (nr_slots > pipe->max_usage &&
1387	size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1388	return -EPERM;
1389
1390	user_bufs = account_pipe_buffers(user: pipe->user, old: pipe->nr_accounted, new: nr_slots);
1391
1392	if (nr_slots > pipe->max_usage &&
1393	(too_many_pipe_buffers_hard(user_bufs) \|\|
1394	too_many_pipe_buffers_soft(user_bufs)) &&
1395	pipe_is_unprivileged_user()) {
1396	ret = -EPERM;
1397	goto out_revert_acct;
1398	}
1399
1400	ret = pipe_resize_ring(pipe, nr_slots);
1401	if (ret < `0`)
1402	goto out_revert_acct;
1403
1404	return pipe->max_usage * PAGE_SIZE;
1405
1406	out_revert_acct:
1407	(void) account_pipe_buffers(user: pipe->user, old: nr_slots, new: pipe->nr_accounted);
1408	return ret;
1409	}
1410
1411	/*
1412	* Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
1413	* not enough to verify that this is a pipe.
1414	*/
1415	struct pipe_inode_info get_pipe_info(struct* file *file, bool for_splice)
1416	{
1417	struct pipe_inode_info *pipe = file->private_data;
1418
1419	if (!pipe)
1420	return NULL;
1421	if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops)
1422	return NULL;
1423	if (for_splice && pipe_has_watch_queue(pipe))
1424	return NULL;
1425	return pipe;
1426	}
1427
1428	long pipe_fcntl(struct file file, unsigned* int cmd, unsigned int arg)
1429	{
1430	struct pipe_inode_info *pipe;
1431	long ret;
1432
1433	pipe = get_pipe_info(file, for_splice: false);
1434	if (!pipe)
1435	return -EBADF;
1436
1437	mutex_lock(&pipe->mutex);
1438
1439	switch (cmd) {
1440	case F_SETPIPE_SZ:
1441	ret = pipe_set_size(pipe, arg);
1442	break;
1443	case F_GETPIPE_SZ:
1444	ret = pipe->max_usage * PAGE_SIZE;
1445	break;
1446	default:
1447	ret = -EINVAL;
1448	break;
1449	}
1450
1451	mutex_unlock(lock: &pipe->mutex);
1452	return ret;
1453	}
1454
/*
 * Superblock operations for pipefs, installed by pipefs_init_fs_context():
 * inodes are destroyed through free_inode_nonrcu and statfs is answered by
 * simple_statfs.
 */
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};
1459
1460	/*
1461	* pipefs should _never_ be mounted by userland - too much of security hassle,
1462	* no real gain from having the whole file system mounted. So we don't need
1463	* any operations on the root directory. However, we need a non-trivial
1464	* d_name - pipe: will go nicely and kill the special-casing in procfs.
1465	*/
1466
1467	static int pipefs_init_fs_context(struct fs_context *fc)
1468	{
1469	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
1470	if (!ctx)
1471	return -ENOMEM;
1472	ctx->ops = &pipefs_ops;
1473	ctx->dops = &pipefs_dentry_operations;
1474	return `0`;
1475	}
1476
/*
 * Filesystem type for "pipefs".  init_pipe_fs() registers it and mounts it
 * internally with kern_mount().
 */
static struct file_system_type pipe_fs_type = {
	.name = "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb = kill_anon_super,
};
1482
1483	#ifdef CONFIG_SYSCTL
/*
 * Instantiate the uint converters used by the "pipe-max-size" sysctl.
 * The first macro is given round_pipe_size; the name it produces
 * (sysctl_user_to_kern_uint_conv_pipe_maxsz) is fed to the second, whose
 * result (do_proc_uint_conv_pipe_maxsz) is what proc_dopipe_max_size()
 * passes to proc_douintvec_conv().
 * NOTE(review): the expansions live in the sysctl headers; presumably a
 * value written by userspace goes through round_pipe_size() before it is
 * stored - confirm there.
 */
static SYSCTL_USER_TO_KERN_UINT_CONV(_pipe_maxsz, round_pipe_size)
static SYSCTL_UINT_CONV_CUSTOM(_pipe_maxsz,
			       sysctl_user_to_kern_uint_conv_pipe_maxsz,
			       sysctl_kern_to_user_uint_conv, true)
1488
1489	static int proc_dopipe_max_size(const struct ctl_table table, int* write,
1490	void buffer, size_t lenp, loff_t *ppos)
1491	{
1492	return proc_douintvec_conv(table, write, buffer, lenp, ppos,
1493	conv: do_proc_uint_conv_pipe_maxsz);
1494	}
1495
1496	static const struct ctl_table fs_pipe_sysctls[] = {
1497	{
1498	.procname = "pipe-max-size",
1499	.data = &pipe_max_size,
1500	.maxlen = sizeof(pipe_max_size),
1501	.mode = `0644`,
1502	.proc_handler = proc_dopipe_max_size,
1503	.extra1 = SYSCTL_ONE,
1504	},
1505	{
1506	.procname = "pipe-user-pages-hard",
1507	.data = &pipe_user_pages_hard,
1508	.maxlen = sizeof(pipe_user_pages_hard),
1509	.mode = `0644`,
1510	.proc_handler = proc_doulongvec_minmax,
1511	},
1512	{
1513	.procname = "pipe-user-pages-soft",
1514	.data = &pipe_user_pages_soft,
1515	.maxlen = sizeof(pipe_user_pages_soft),
1516	.mode = `0644`,
1517	.proc_handler = proc_doulongvec_minmax,
1518	},
1519	};
1520	#endif
1521
1522	static int __init init_pipe_fs(void)
1523	{
1524	int err = register_filesystem(&pipe_fs_type);
1525
1526	if (!err) {
1527	pipe_mnt = kern_mount(&pipe_fs_type);
1528	if (IS_ERR(ptr: pipe_mnt)) {
1529	err = PTR_ERR(ptr: pipe_mnt);
1530	unregister_filesystem(&pipe_fs_type);
1531	}
1532	}
1533	#ifdef CONFIG_SYSCTL
1534	register_sysctl_init("fs", fs_pipe_sysctls);
1535	#endif
1536	return err;
1537	}
1538
1539	fs_initcall(init_pipe_fs);
1540

source code of linux/fs/pipe.c