1 /*
2 * linux/fs/nfs/direct.c
3 *
4 * High-performance direct I/O for the NFS client
5 *
6 * When an application requests uncached I/O, all read and write requests
7 * are made directly to the server; data stored or fetched via these
8 * requests is not cached in the Linux page cache. The client does not
9 * correct unaligned requests from applications. All requested bytes are
10 * held on permanent storage before a direct write system call returns to
11 * an application. Applications that manage their own data caching, such
12 * as databases, make very good use of direct I/O on local file systems.
13 *
14 * Solaris implements an uncached I/O facility called directio() that
15 * is used for backups and sequential I/O to very large files. Solaris
16 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
17 * an undocumented mount option.
18 *
19 * Note that I/O to read in executables (e.g. kernel_read) cannot use
20 * direct (kiobuf) reads because there is no vma backing the passed-in
21 * data buffer.
22 *
23 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
24 *
25 * Initial implementation: 12/2001 by Chuck Lever <cel@netapp.com>
26 *
27 * TODO:
28 *
29 * 1. Use concurrent asynchronous network requests rather than
30 * serialized synchronous network requests for normal (non-sync)
31 * direct I/O.
32 */
33
34 #include <linux/config.h>
35 #include <linux/sched.h>
36 #include <linux/kernel.h>
37 #include <linux/file.h>
38 #include <linux/errno.h>
39 #include <linux/nfs_fs.h>
40 #include <linux/smp_lock.h>
41 #include <linux/sunrpc/clnt.h>
42 #include <linux/iobuf.h>
43
44 #include <asm/system.h>
45 #include <asm/uaccess.h>
46
47 #define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS)
48 #define VERF_SIZE (2 * sizeof(__u32))
49
50 static inline int
nfs_direct_read_rpc(struct file * file,struct nfs_readargs * arg)51 nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg)
52 {
53 int result;
54 struct inode * inode = file->f_dentry->d_inode;
55 struct nfs_fattr fattr;
56 struct rpc_message msg;
57 struct nfs_readres res = { &fattr, arg->count, 0 };
58
59 #ifdef CONFIG_NFS_V3
60 msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ?
61 NFS3PROC_READ : NFSPROC_READ;
62 #else
63 msg.rpc_proc = NFSPROC_READ;
64 #endif
65 msg.rpc_argp = arg;
66 msg.rpc_resp = &res;
67
68 lock_kernel();
69 msg.rpc_cred = nfs_file_cred(file);
70 fattr.valid = 0;
71 result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
72 nfs_refresh_inode(inode, &fattr);
73 unlock_kernel();
74
75 return result;
76 }
77
78 static inline int
nfs_direct_write_rpc(struct file * file,struct nfs_writeargs * arg,struct nfs_writeverf * verf)79 nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg,
80 struct nfs_writeverf *verf)
81 {
82 int result;
83 struct inode *inode = file->f_dentry->d_inode;
84 struct nfs_fattr fattr;
85 struct rpc_message msg;
86 struct nfs_writeres res = { &fattr, verf, 0 };
87
88 #ifdef CONFIG_NFS_V3
89 msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ?
90 NFS3PROC_WRITE : NFSPROC_WRITE;
91 #else
92 msg.rpc_proc = NFSPROC_WRITE;
93 #endif
94 msg.rpc_argp = arg;
95 msg.rpc_resp = &res;
96
97 lock_kernel();
98 msg.rpc_cred = get_rpccred(nfs_file_cred(file));
99 fattr.valid = 0;
100 result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
101 nfs_write_attributes(inode, &fattr);
102 put_rpccred(msg.rpc_cred);
103 unlock_kernel();
104
105 #ifdef CONFIG_NFS_V3
106 if (NFS_PROTO(inode)->version == 3) {
107 if (result > 0) {
108 if ((arg->stable == NFS_FILE_SYNC) &&
109 (verf->committed != NFS_FILE_SYNC)) {
110 printk(KERN_ERR
111 "%s: server didn't sync stable write request\n",
112 __FUNCTION__);
113 return -EIO;
114 }
115
116 if (result != arg->count) {
117 printk(KERN_INFO
118 "%s: short write, count=%u, result=%d\n",
119 __FUNCTION__, arg->count, result);
120 }
121 }
122 return result;
123 } else {
124 #endif
125 verf->committed = NFS_FILE_SYNC; /* NFSv2 always syncs data */
126 if (result == 0)
127 return arg->count;
128 return result;
129 #ifdef CONFIG_NFS_V3
130 }
131 #endif
132 }
133
134 #ifdef CONFIG_NFS_V3
135 static inline int
nfs_direct_commit_rpc(struct inode * inode,loff_t offset,size_t count,struct nfs_writeverf * verf)136 nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
137 struct nfs_writeverf *verf)
138 {
139 int result;
140 struct nfs_fattr fattr;
141 struct nfs_writeargs arg = { NFS_FH(inode), offset, count, 0, 0,
142 NULL };
143 struct nfs_writeres res = { &fattr, verf, 0 };
144 struct rpc_message msg = { NFS3PROC_COMMIT, &arg, &res, NULL };
145
146 fattr.valid = 0;
147
148 lock_kernel();
149 result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
150 nfs_write_attributes(inode, &fattr);
151 unlock_kernel();
152
153 return result;
154 }
155 #else
156 static inline int
nfs_direct_commit_rpc(struct inode * inode,loff_t offset,size_t count,struct nfs_writeverf * verf)157 nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
158 struct nfs_writeverf *verf)
159 {
160 return 0;
161 }
162 #endif
163
164 /*
165 * Walk through the iobuf and create an iovec for each "rsize" bytes.
166 */
167 static int
nfs_direct_read(struct file * file,struct kiobuf * iobuf,loff_t offset,size_t count)168 nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset,
169 size_t count)
170 {
171 int curpage, total;
172 int result = 0;
173 struct inode *inode = file->f_dentry->d_inode;
174 int rsize = NFS_SERVER(inode)->rsize;
175 struct page *pages[NFS_READ_MAXIOV];
176 struct nfs_readargs args = { NFS_FH(inode), offset, 0, iobuf->offset,
177 pages };
178
179 total = 0;
180 curpage = 0;
181 while (count) {
182 int len, request;
183 struct page **dest = pages;
184
185 request = count;
186 if (count > rsize)
187 request = rsize;
188 args.count = request;
189 args.offset = offset;
190 args.pgbase = (iobuf->offset + total) & ~PAGE_MASK;
191 len = PAGE_SIZE - args.pgbase;
192
193 do {
194 struct page *page = iobuf->maplist[curpage];
195
196 if (curpage >= iobuf->nr_pages || !page) {
197 result = -EFAULT;
198 goto out_err;
199 }
200
201 *dest++ = page;
202 /* zero after the first iov */
203 if (request < len)
204 break;
205 request -= len;
206 len = PAGE_SIZE;
207 curpage++;
208 } while (request != 0);
209
210 result = nfs_direct_read_rpc(file, &args);
211
212 if (result < 0)
213 break;
214
215 total += result;
216 if (result < args.count) /* NFSv2ism */
217 break;
218 count -= result;
219 offset += result;
220 };
221 out_err:
222 if (!total)
223 return result;
224 return total;
225 }
226
227 /*
228 * Walk through the iobuf and create an iovec for each "wsize" bytes.
229 * If only one network write is necessary, or if the O_SYNC flag or
230 * 'sync' mount option are present, or if this is a V2 inode, use
231 * FILE_SYNC. Otherwise, use UNSTABLE and finish with a COMMIT.
232 *
233 * The mechanics of this function are much the same as nfs_direct_read,
234 * with the added complexity of committing unstable writes.
235 */
236 static int
nfs_direct_write(struct file * file,struct kiobuf * iobuf,loff_t offset,size_t count)237 nfs_direct_write(struct file *file, struct kiobuf *iobuf,
238 loff_t offset, size_t count)
239 {
240 int curpage, total;
241 int need_commit = 0;
242 int result = 0;
243 loff_t save_offset = offset;
244 struct inode *inode = file->f_dentry->d_inode;
245 int wsize = NFS_SERVER(inode)->wsize;
246 struct nfs_writeverf first_verf, ret_verf;
247 struct page *pages[NFS_WRITE_MAXIOV];
248 struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0,
249 pages };
250
251 #ifdef CONFIG_NFS_V3
252 if ((NFS_PROTO(inode)->version == 3) && (count > wsize) &&
253 (!IS_SYNC(inode)))
254 args.stable = NFS_UNSTABLE;
255 #endif
256
257 retry:
258 total = 0;
259 curpage = 0;
260 while (count) {
261 int len, request;
262 struct page **dest = pages;
263
264 request = count;
265 if (count > wsize)
266 request = wsize;
267 args.count = request;
268 args.offset = offset;
269 args.pgbase = (iobuf->offset + total) & ~PAGE_MASK;
270 len = PAGE_SIZE - args.pgbase;
271
272 do {
273 struct page *page = iobuf->maplist[curpage];
274
275 if (curpage >= iobuf->nr_pages || !page) {
276 result = -EFAULT;
277 goto out_err;
278 }
279
280 *dest++ = page;
281 /* zero after the first iov */
282 if (request < len)
283 break;
284 request -= len;
285 len = PAGE_SIZE;
286 curpage++;
287 } while (request != 0);
288
289 result = nfs_direct_write_rpc(file, &args, &ret_verf);
290
291 if (result < 0)
292 break;
293
294 if (!total)
295 memcpy(&first_verf.verifier, &ret_verf.verifier,
296 VERF_SIZE);
297 if (ret_verf.committed != NFS_FILE_SYNC) {
298 need_commit = 1;
299 if (memcmp(&first_verf.verifier, &ret_verf.verifier,
300 VERF_SIZE))
301 goto print_retry;
302 }
303
304 total += result;
305 count -= result;
306 offset += result;
307 };
308
309 out_err:
310 /*
311 * Commit data written so far, even in the event of an error
312 */
313 if (need_commit) {
314 if (nfs_direct_commit_rpc(inode, save_offset,
315 iobuf->length - count, &ret_verf))
316 goto print_retry;
317 if (memcmp(&first_verf.verifier, &ret_verf.verifier,
318 VERF_SIZE))
319 goto print_retry;
320 }
321
322 if (!total)
323 return result;
324 return total;
325
326 print_retry:
327 printk(KERN_INFO "%s: detected server restart; retrying with FILE_SYNC\n",
328 __FUNCTION__);
329 args.stable = NFS_FILE_SYNC;
330 offset = save_offset;
331 count = iobuf->length;
332 goto retry;
333 }
334
335 /*
336 * Read or write data, moving the data directly to/from the
337 * application's buffer without caching in the page cache.
338 *
339 * Rules for direct I/O
340 *
341 * 1. block size = 512 bytes or more
342 * 2. file byte offset is block aligned
343 * 3. byte count is a multiple of block size
344 * 4. user buffer is not aligned
345 * 5. user buffer is faulted in and pinned
346 *
347 * These are verified before we get here.
348 */
349 int
nfs_direct_IO(int rw,struct file * file,struct kiobuf * iobuf,unsigned long blocknr,int blocksize)350 nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf,
351 unsigned long blocknr, int blocksize)
352 {
353 int result = -EINVAL;
354 size_t count = iobuf->length;
355 struct dentry *dentry = file->f_dentry;
356 struct inode *inode = dentry->d_inode;
357 loff_t offset = (loff_t) blocknr << inode->i_blkbits;
358
359 switch (rw) {
360 case READ:
361 dfprintk(VFS,
362 "NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n",
363 dentry->d_parent->d_name.name,
364 dentry->d_name.name, offset, count);
365
366 result = nfs_direct_read(file, iobuf, offset, count);
367 break;
368 case WRITE:
369 dfprintk(VFS,
370 "NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n",
371 dentry->d_parent->d_name.name,
372 dentry->d_name.name, offset, count);
373
374 result = nfs_direct_write(file, iobuf, offset, count);
375 break;
376 default:
377 break;
378 }
379
380 dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
381 return result;
382 }
383