1	The text below describes the locking rules for VFS-related methods.
2It is (believed to be) up-to-date. *Please*, if you change anything in
3prototypes or locking protocols - update this file. And update the relevant
4instances in the tree, don't leave that to maintainers of filesystems/devices/
5etc. At the very least, put the list of dubious cases at the end of this file.
6Don't turn it into a log - maintainers of out-of-tree code are supposed to
7be able to use diff(1).
8	Thing currently missing here: socket operations. Alexey?
9
10--------------------------- dentry_operations --------------------------
11prototypes:
12	int (*d_revalidate)(struct dentry *, int);
13	int (*d_hash) (struct dentry *, struct qstr *);
14	int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
15	int (*d_delete)(struct dentry *);
16	void (*d_release)(struct dentry *);
17	void (*d_iput)(struct dentry *, struct inode *);
18
19locking rules:
20	none have BKL
21		dcache_lock	may block
22d_revalidate:	no		yes
23d_hash:		no		yes
24d_compare:	yes		no
25d_delete:	yes		no
26d_release:	no		yes
27d_iput:		no		yes
28
29--------------------------- inode_operations ---------------------------
30prototypes:
31	int (*create) (struct inode *,struct dentry *,int);
32	struct dentry * (*lookup) (struct inode *,struct dentry *);
33	int (*link) (struct dentry *,struct inode *,struct dentry *);
34	int (*unlink) (struct inode *,struct dentry *);
35	int (*symlink) (struct inode *,struct dentry *,const char *);
36	int (*mkdir) (struct inode *,struct dentry *,int);
37	int (*rmdir) (struct inode *,struct dentry *);
38	int (*mknod) (struct inode *,struct dentry *,int,int);
39	int (*rename) (struct inode *, struct dentry *,
40			struct inode *, struct dentry *);
41	int (*readlink) (struct dentry *, char *,int);
42	int (*follow_link) (struct dentry *, struct nameidata *);
43	void (*truncate) (struct inode *);
44	int (*permission) (struct inode *, int);
45	int (*revalidate) (struct dentry *);
46	int (*setattr) (struct dentry *, struct iattr *);
47	int (*getattr) (struct dentry *, struct iattr *);
48	int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
49	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
50	ssize_t (*listxattr) (struct dentry *, char *, size_t);
51	int (*removexattr) (struct dentry *, const char *);
52
53locking rules:
54	all may block
55		BKL	i_sem(inode)	i_zombie(inode)
56lookup:		yes	yes		no
57create:		yes	yes		yes
58link:		yes	yes		yes
59mknod:		yes	yes		yes
60mkdir:		yes	yes		yes
61unlink:		yes	yes		yes
62rmdir:		yes	yes		yes		(see below)
63rename:		yes	yes (both)	yes (both)	(see below)
64readlink:	no	no		no
65follow_link:	no	no		no
66truncate:	yes	yes		no		(see below)
67setattr:	yes	if ATTR_SIZE	no
68permission:	yes	no		no
69getattr:						(see below)
70revalidate:	no					(see below)
71setxattr:	yes	yes		no
72getxattr:	yes	yes		no
73listxattr:	yes	yes		no
74removexattr:	yes	yes		no
75	Additionally, ->rmdir() has i_zombie on victim and so does ->rename()
76in case when target exists and is a directory.
77	->rename() on directories has (per-superblock) ->s_vfs_rename_sem.
78	->revalidate() may be called both with and without the i_sem
79on dentry->d_inode. VFS never calls it with i_zombie on dentry->d_inode,
80but watch for other methods directly calling this one...
81	->truncate() is never called directly - it's a callback, not a
82method. It's called by vmtruncate() - library function normally used by
83->setattr(). Locking information above applies to that call (i.e. is
84inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
85passed).
86	->getattr() is currently unused.
87
88--------------------------- super_operations ---------------------------
89prototypes:
90	void (*read_inode) (struct inode *);
91	void (*write_inode) (struct inode *, int);
92	void (*put_inode) (struct inode *);
93	void (*delete_inode) (struct inode *);
94	void (*put_super) (struct super_block *);
95	void (*write_super) (struct super_block *);
96	int (*sync_fs) (struct super_block *);
97	int (*statfs) (struct super_block *, struct statfs *);
98	int (*remount_fs) (struct super_block *, int *, char *);
99	void (*clear_inode) (struct inode *);
100	void (*umount_begin) (struct super_block *);
101
102locking rules:
103	All may block.
104		BKL	s_lock	mount_sem
105read_inode:	yes				(see below)
106write_inode:	no
107put_inode:	no
108delete_inode:	no
109clear_inode:	no
110put_super:	yes	yes	maybe		(see below)
111write_super:	yes	yes	maybe		(see below)
112sync_fs:	yes	no	maybe		(see below)
113statfs:		yes	no	no
114remount_fs:	yes	yes	maybe		(see below)
115umount_begin:	yes	no	maybe		(see below)
116
117->read_inode() is not a method - it's a callback used in iget()/iget4().
118Rules for mount_sem are not too nice - it is going to die and be replaced
119by a better scheme anyway.
120
121--------------------------- file_system_type ---------------------------
122prototypes:
123	struct super_block *(*read_super) (struct super_block *, void *, int);
124locking rules:
125may block	BKL	->s_lock	mount_sem
126yes		yes	yes		maybe
127
128--------------------------- address_space_operations --------------------------
129prototypes:
130	int (*writepage)(struct page *);
131	int (*readpage)(struct file *, struct page *);
132	int (*sync_page)(struct page *);
133	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
134	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
135	int (*bmap)(struct address_space *, long);
136	int (*flushpage) (struct page *, unsigned long);
137	int (*releasepage) (struct page *, int);
138	int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
139
140locking rules:
141	All may block
142		BKL	PageLocked(page)
143writepage:	no	yes, unlocks
144readpage:	no	yes, unlocks
145sync_page:	no	maybe
146prepare_write:	no	yes
147commit_write:	no	yes
148bmap:		yes
149flushpage:	no	yes
150releasepage:	no	yes
151
152	->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
153may be called from the request handler (/dev/loop).
154	->readpage() and ->writepage() unlock the page.
155	->sync_page() locking rules are not well-defined - usually it is called
156with lock on page, but that is not guaranteed. Considering the currently
157existing instances of this method ->sync_page() itself doesn't look
158well-defined...
159	->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
160filesystems and by the swapper. The latter will eventually go away. None
161of the instances actually need the BKL. Please keep it that way and don't
162breed new callers.
163	->flushpage() is called when the filesystem must attempt to drop
164some or all of the buffers from the page when it is being truncated.  It
165returns zero on success.  If ->flushpage is zero, the kernel uses
166block_flushpage() instead.
167	->releasepage() is called when the kernel is about to try to drop the
168buffers from the page in preparation for freeing it.  It returns zero to
169indicate that the buffers are (or may be) freeable.  If ->releasepage is zero,
170the kernel assumes that the fs has no private interest in the buffers.
171
172	Note: currently almost all instances of address_space methods are
173using BKL for internal serialization and that's one of the worst sources
174of contention. Normally they are calling library functions (in fs/buffer.c)
175and pass foo_get_block() as a callback (on local block-based filesystems,
176that is). BKL is not needed for library stuff and is usually taken by
177foo_get_block(). That is overkill, since block bitmaps can be protected by
178internal fs locking and real critical areas are much smaller than the areas
179filesystems protect now.
180
181--------------------------- file_lock ------------------------------------
182prototypes:
183	void (*fl_notify)(struct file_lock *);	/* unblock callback */
184	void (*fl_insert)(struct file_lock *);	/* lock insertion callback */
185	void (*fl_remove)(struct file_lock *);	/* lock removal callback */
186
187locking rules:
188		BKL	may block
189fl_notify:	yes	no
190fl_insert:	yes	maybe
191fl_remove:	yes	maybe
192	Currently only NLM provides instances of this class. None of
193them block. If you have out-of-tree instances - please, show up. Locking
194in that area will change.
195
196--------------------------- buffer_head -----------------------------------
197prototypes:
198	void (*b_end_io)(struct buffer_head *bh, int uptodate);
199
200locking rules:
201	called from interrupts. In other words, extreme care is needed here.
202bh is locked, but that is the only guarantee we have here. Currently only RAID1,
203highmem and fs/buffer.c are providing these. Block devices call this method
204upon the IO completion.
205
206--------------------------- block_device_operations -----------------------
207prototypes:
208	int (*open) (struct inode *, struct file *);
209	int (*release) (struct inode *, struct file *);
210	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
211	int (*check_media_change) (kdev_t);
212	int (*revalidate) (kdev_t);
213locking rules:
214			BKL	bd_sem
215open:			yes	yes
216release:		yes	yes
217ioctl:			yes	no
218check_media_change:	yes	no
219revalidate:		yes	no
220
221The last two are called only from check_disk_change(). Prototypes are very
222bad - as soon as we get disk_struct they will change (and methods will
223become per-disk instead of per-partition).
224
225--------------------------- file_operations -------------------------------
226prototypes:
227	loff_t (*llseek) (struct file *, loff_t, int);
228	ssize_t (*read) (struct file *, char *, size_t, loff_t *);
229	ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
230	int (*readdir) (struct file *, void *, filldir_t);
231	unsigned int (*poll) (struct file *, struct poll_table_struct *);
232	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
233	int (*mmap) (struct file *, struct vm_area_struct *);
234	int (*open) (struct inode *, struct file *);
235	int (*flush) (struct file *);
236	int (*release) (struct inode *, struct file *);
237	int (*fsync) (struct file *, struct dentry *, int datasync);
238	int (*fasync) (int, struct file *, int);
239	int (*lock) (struct file *, int, struct file_lock *);
240	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
241	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
242};
243
244locking rules:
245	All except ->poll() may block.
246		BKL
247llseek:		yes
248read:		no
249write:		no
250readdir:	yes	(see below)
251poll:		no
252ioctl:		yes	(see below)
253mmap:		no
254open:		maybe	(see below)
255flush:		yes
256release:	no
257fsync:		yes	(see below)
258fasync:		yes	(see below)
259lock:		yes
260readv:		no
261writev:		no
262
263->open() locking is in-transit: big lock partially moved into the methods.
264The only exception is ->open() in the instances of file_operations that never
265end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
266(chrdev_open() takes the lock before replacing ->f_op and calling the secondary
267method). As soon as we fix the handling of module reference counters all
268instances of ->open() will be called without the BKL.
269
270Note: ext2_release() was *the* source of contention on fs-intensive
271loads and dropping BKL on ->release() helps to get rid of that (we still
272grab BKL for cases when we close a file that had been opened r/w, but that
273can and should be done using the internal locking with smaller critical areas).
274Current worst offender is ext2_get_block()...
275
276->fasync() is a mess. This area needs a big cleanup and that will probably
277affect locking.
278
279->readdir() and ->ioctl() on directories must be changed. Ideally we would
280move ->readdir() to inode_operations and use a separate method for directory
281->ioctl() or kill the latter completely. One of the problems is that for
282anything that resembles union-mount we won't have a struct file for all
283components. And there are other reasons why the current interface is a mess...
284
285->read() on directories probably must go away - we should just enforce -EISDIR
286in sys_read() and friends.
287
288->fsync() has i_sem on inode.
289
290--------------------------- dquot_operations -------------------------------
291prototypes:
292	void (*initialize) (struct inode *, short);
293	void (*drop) (struct inode *);
294	int (*alloc_block) (const struct inode *, unsigned long, char);
295	int (*alloc_inode) (const struct inode *, unsigned long);
296	void (*free_block) (const struct inode *, unsigned long);
297	void (*free_inode) (const struct inode *, unsigned long);
298	int (*transfer) (struct dentry *, struct iattr *);
299
300locking rules:
301		BKL
302initialize:	no
303drop:		no
304alloc_block:	yes
305alloc_inode:	yes
306free_block:	yes
307free_inode:	yes
308transfer:	no
309
310--------------------------- vm_operations_struct -----------------------------
311prototypes:
312	void (*open)(struct vm_area_struct*);
313	void (*close)(struct vm_area_struct*);
314	struct page *(*nopage)(struct vm_area_struct*, unsigned long, int);
315
316locking rules:
317		BKL	mmap_sem
318open:		no	yes
319close:		no	yes
320nopage:		no	yes
321
322================================================================================
323			Dubious stuff
324
325(if you break something or notice that it is broken and do not fix it yourself
326- at least put it here)
327
328ipc/shm.c::shm_delete() - may need BKL.
329->read() and ->write() in many drivers are (probably) missing BKL.
330drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL.
331