1 The text below describes the locking rules for VFS-related methods. 2It is (believed to be) up-to-date. *Please*, if you change anything in 3prototypes or locking protocols - update this file. And update the relevant 4instances in the tree, don't leave that to maintainers of filesystems/devices/ 5etc. At the very least, put the list of dubious cases at the end of this file. 6Don't turn it into a log - maintainers of out-of-the-tree code are supposed to 7be able to use diff(1). 8 Things currently missing here: socket operations. Alexey? 9 10--------------------------- dentry_operations -------------------------- 11prototypes: 12 int (*d_revalidate)(struct dentry *, int); 13 int (*d_hash) (struct dentry *, struct qstr *); 14 int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); 15 int (*d_delete)(struct dentry *); 16 void (*d_release)(struct dentry *); 17 void (*d_iput)(struct dentry *, struct inode *); 18 19locking rules: 20 none have BKL 21 dcache_lock may block 22d_revalidate: no yes 23d_hash: no yes 24d_compare: yes no 25d_delete: yes no 26d_release: no yes 27d_iput: no yes 28 29--------------------------- inode_operations --------------------------- 30prototypes: 31 int (*create) (struct inode *,struct dentry *,int); 32 struct dentry * (*lookup) (struct inode *,struct dentry *); 33 int (*link) (struct dentry *,struct inode *,struct dentry *); 34 int (*unlink) (struct inode *,struct dentry *); 35 int (*symlink) (struct inode *,struct dentry *,const char *); 36 int (*mkdir) (struct inode *,struct dentry *,int); 37 int (*rmdir) (struct inode *,struct dentry *); 38 int (*mknod) (struct inode *,struct dentry *,int,int); 39 int (*rename) (struct inode *, struct dentry *, 40 struct inode *, struct dentry *); 41 int (*readlink) (struct dentry *, char *,int); 42 int (*follow_link) (struct dentry *, struct nameidata *); 43 void (*truncate) (struct inode *); 44 int (*permission) (struct inode *, int); 45 int (*revalidate) (struct dentry *); 46 int (*setattr) 
(struct dentry *, struct iattr *); 47 int (*getattr) (struct dentry *, struct iattr *); 48 int (*setxattr) (struct dentry *, const char *, void *, size_t, int); 49 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 50 ssize_t (*listxattr) (struct dentry *, char *, size_t); 51 int (*removexattr) (struct dentry *, const char *); 52 53locking rules: 54 all may block 55 BKL i_sem(inode) i_zombie(inode) 56lookup: yes yes no 57create: yes yes yes 58link: yes yes yes 59mknod: yes yes yes 60mkdir: yes yes yes 61unlink: yes yes yes 62rmdir: yes yes yes (see below) 63rename: yes yes (both) yes (both) (see below) 64readlink: no no no 65follow_link: no no no 66truncate: yes yes no (see below) 67setattr: yes if ATTR_SIZE no 68permission: yes no no 69getattr: (see below) 70revalidate: no (see below) 71setxattr: yes yes no 72getxattr: yes yes no 73listxattr: yes yes no 74removexattr: yes yes no 75 Additionally, ->rmdir() has i_zombie on victim and so does ->rename() 76in case when target exists and is a directory. 77 ->rename() on directories has (per-superblock) ->s_vfs_rename_sem. 78 ->revalidate(), it may be called both with and without the i_sem 79on dentry->d_inode. VFS never calls it with i_zombie on dentry->d_inode, 80but watch for other methods directly calling this one... 81 ->truncate() is never called directly - it's a callback, not a 82method. It's called by vmtruncate() - library function normally used by 83->setattr(). Locking information above applies to that call (i.e. is 84inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been 85passed). 86 ->getattr() is currently unused. 
87 88--------------------------- super_operations --------------------------- 89prototypes: 90 void (*read_inode) (struct inode *); 91 void (*write_inode) (struct inode *, int); 92 void (*put_inode) (struct inode *); 93 void (*delete_inode) (struct inode *); 94 void (*put_super) (struct super_block *); 95 void (*write_super) (struct super_block *); 96 int (*sync_fs) (struct super_block *); 97 int (*statfs) (struct super_block *, struct statfs *); 98 int (*remount_fs) (struct super_block *, int *, char *); 99 void (*clear_inode) (struct inode *); 100 void (*umount_begin) (struct super_block *); 101 102locking rules: 103 All may block. 104 BKL s_lock mount_sem 105read_inode: yes (see below) 106write_inode: no 107put_inode: no 108delete_inode: no 109clear_inode: no 110put_super: yes yes maybe (see below) 111write_super: yes yes maybe (see below) 112sync_fs: yes no maybe (see below) 113statfs: yes no no 114remount_fs: yes yes maybe (see below) 115umount_begin: yes no maybe (see below) 116 117->read_inode() is not a method - it's a callback used in iget()/iget4(). 118Rules for mount_sem are not too nice - it is going to die and be replaced 119by a better scheme anyway. 
120 121--------------------------- file_system_type --------------------------- 122prototypes: 123 struct super_block *(*read_super) (struct super_block *, void *, int); 124locking rules: 125may block BKL ->s_lock mount_sem 126yes yes yes maybe 127 128--------------------------- address_space_operations -------------------------- 129prototypes: 130 int (*writepage)(struct page *); 131 int (*readpage)(struct file *, struct page *); 132 int (*sync_page)(struct page *); 133 int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); 134 int (*commit_write)(struct file *, struct page *, unsigned, unsigned); 135 int (*bmap)(struct address_space *, long); 136 int (*flushpage) (struct page *, unsigned long); 137 int (*releasepage) (struct page *, int); 138 int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); 139 140locking rules: 141 All may block 142 BKL PageLocked(page) 143writepage: no yes, unlocks 144readpage: no yes, unlocks 145sync_page: no maybe 146prepare_write: no yes 147commit_write: no yes 148bmap: yes 149flushpage: no yes 150releasepage: no yes 151 152 ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() 153may be called from the request handler (/dev/loop). 154 ->readpage() and ->writepage() unlock the page. 155 ->sync_page() locking rules are not well-defined - usually it is called 156with lock on page, but that is not guaranteed. Considering the currently 157existing instances of this method ->sync_page() itself doesn't look 158well-defined... 159 ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some 160filesystems and by the swapper. The latter will eventually go away. All 161instances do not actually need the BKL. Please, keep it that way and don't 162breed new callers. 163 ->flushpage() is called when the filesystem must attempt to drop 164some or all of the buffers from the page when it is being truncated. It 165returns zero on success. 
If ->flushpage is zero, the kernel uses 166block_flushpage() instead. 167 ->releasepage() is called when the kernel is about to try to drop the 168buffers from the page in preparation for freeing it. It returns zero to 169indicate that the buffers are (or may be) freeable. If ->releasepage is zero, 170the kernel assumes that the fs has no private interest in the buffers. 171 172 Note: currently almost all instances of address_space methods are 173using BKL for internal serialization and that's one of the worst sources 174of contention. Normally they are calling library functions (in fs/buffer.c) 175and pass foo_get_block() as a callback (on local block-based filesystems, 176indeed). BKL is not needed for library stuff and is usually taken by 177foo_get_block(). It's overkill, since block bitmaps can be protected by 178internal fs locking and real critical areas are much smaller than the areas 179filesystems protect now. 180 181--------------------------- file_lock ------------------------------------ 182prototypes: 183 void (*fl_notify)(struct file_lock *); /* unblock callback */ 184 void (*fl_insert)(struct file_lock *); /* lock insertion callback */ 185 void (*fl_remove)(struct file_lock *); /* lock removal callback */ 186 187locking rules: 188 BKL may block 189fl_notify: yes no 190fl_insert: yes maybe 191fl_remove: yes maybe 192 Currently only NLM provides instances of this class. None of 193them block. If you have out-of-tree instances - please, show up. Locking 194in that area will change. 195 196--------------------------- buffer_head ----------------------------------- 197prototypes: 198 void (*b_end_io)(struct buffer_head *bh, int uptodate); 199 200locking rules: 201 called from interrupts. In other words, extreme care is needed here. 202bh is locked, but that's all the guarantees we have here. Currently only RAID1, 203highmem and fs/buffer.c are providing these. Block devices call this method 204upon the IO completion. 
205 206--------------------------- block_device_operations ----------------------- 207prototypes: 208 int (*open) (struct inode *, struct file *); 209 int (*release) (struct inode *, struct file *); 210 int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); 211 int (*check_media_change) (kdev_t); 212 int (*revalidate) (kdev_t); 213locking rules: 214 BKL bd_sem 215open: yes yes 216release: yes yes 217ioctl: yes no 218check_media_change: yes no 219revalidate: yes no 220 221The last two are called only from check_disk_change(). Prototypes are very 222bad - as soon as we'll get disk_struct they will change (and methods will 223become per-disk instead of per-partition). 224 225--------------------------- file_operations ------------------------------- 226prototypes: 227 loff_t (*llseek) (struct file *, loff_t, int); 228 ssize_t (*read) (struct file *, char *, size_t, loff_t *); 229 ssize_t (*write) (struct file *, const char *, size_t, loff_t *); 230 int (*readdir) (struct file *, void *, filldir_t); 231 unsigned int (*poll) (struct file *, struct poll_table_struct *); 232 int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); 233 int (*mmap) (struct file *, struct vm_area_struct *); 234 int (*open) (struct inode *, struct file *); 235 int (*flush) (struct file *); 236 int (*release) (struct inode *, struct file *); 237 int (*fsync) (struct file *, struct dentry *, int datasync); 238 int (*fasync) (int, struct file *, int); 239 int (*lock) (struct file *, int, struct file_lock *); 240 ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); 241 ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); 242}; 243 244locking rules: 245 All except ->poll() may block. 
246 BKL 247llseek: yes 248read: no 249write: no 250readdir: yes (see below) 251poll: no 252ioctl: yes (see below) 253mmap: no 254open: maybe (see below) 255flush: yes 256release: no 257fsync: yes (see below) 258fasync: yes (see below) 259lock: yes 260readv: no 261writev: no 262 263->open() locking is in transition: the big lock has been partially moved into the methods. 264The only exception is ->open() in the instances of file_operations that never 265end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices 266(chrdev_open() takes the lock before replacing ->f_op and calling the secondary 267method). As soon as we fix the handling of module reference counters all 268instances of ->open() will be called without the BKL. 269 270Note: ext2_release() was *the* source of contention on fs-intensive 271loads and dropping BKL on ->release() helps to get rid of that (we still 272grab BKL for cases when we close a file that had been opened r/w, but that 273can and should be done using the internal locking with smaller critical areas). 274Current worst offender is ext2_get_block()... 275 276->fasync() is a mess. This area needs a big cleanup and that will probably 277affect locking. 278 279->readdir() and ->ioctl() on directories must be changed. Ideally we would 280move ->readdir() to inode_operations and use a separate method for directory 281->ioctl() or kill the latter completely. One of the problems is that for 282anything that resembles union-mount we won't have a struct file for all 283components. And there are other reasons why the current interface is a mess... 284 285->read() on directories probably must go away - we should just enforce -EISDIR 286in sys_read() and friends. 287 288->fsync() has i_sem on inode. 
289 290--------------------------- dquot_operations ------------------------------- 291prototypes: 292 void (*initialize) (struct inode *, short); 293 void (*drop) (struct inode *); 294 int (*alloc_block) (const struct inode *, unsigned long, char); 295 int (*alloc_inode) (const struct inode *, unsigned long); 296 void (*free_block) (const struct inode *, unsigned long); 297 void (*free_inode) (const struct inode *, unsigned long); 298 int (*transfer) (struct dentry *, struct iattr *); 299 300locking rules: 301 BKL 302initialize: no 303drop: no 304alloc_block: yes 305alloc_inode: yes 306free_block: yes 307free_inode: yes 308transfer: no 309 310--------------------------- vm_operations_struct ----------------------------- 311prototypes: 312 void (*open)(struct vm_area_struct*); 313 void (*close)(struct vm_area_struct*); 314 struct page *(*nopage)(struct vm_area_struct*, unsigned long, int); 315 316locking rules: 317 BKL mmap_sem 318open: no yes 319close: no yes 320nopage: no yes 321 322================================================================================ 323 Dubious stuff 324 325(if you break something or notice that it is broken and do not fix it yourself 326- at least put it here) 327 328ipc/shm.c::shm_delete() - may need BKL. 329->read() and ->write() in many drivers are (probably) missing BKL. 330drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL. 331