1 /*
2  * linux/fs/nfs/direct.c
3  *
4  * High-performance direct I/O for the NFS client
5  *
6  * When an application requests uncached I/O, all read and write requests
7  * are made directly to the server; data stored or fetched via these
8  * requests is not cached in the Linux page cache.  The client does not
9  * correct unaligned requests from applications.  All requested bytes are
10  * held on permanent storage before a direct write system call returns to
11  * an application.  Applications that manage their own data caching, such
12  * as databases, make very good use of direct I/O on local file systems.
13  *
14  * Solaris implements an uncached I/O facility called directio() that
15  * is used for backups and sequential I/O to very large files.  Solaris
16  * also supports uncaching whole NFS partitions with "-o forcedirectio,"
17  * an undocumented mount option.
18  *
19  * Note that I/O to read in executables (e.g. kernel_read) cannot use
20  * direct (kiobuf) reads because there is no vma backing the passed-in
21  * data buffer.
22  *
23  * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust.
24  *
25  * Initial implementation:	12/2001 by Chuck Lever <cel@netapp.com>
26  *
27  * TODO:
28  *
29  * 1.  Use concurrent asynchronous network requests rather than
30  *     serialized synchronous network requests for normal (non-sync)
31  *     direct I/O.
32  */
33 
34 #include <linux/config.h>
35 #include <linux/sched.h>
36 #include <linux/kernel.h>
37 #include <linux/file.h>
38 #include <linux/errno.h>
39 #include <linux/nfs_fs.h>
40 #include <linux/smp_lock.h>
41 #include <linux/sunrpc/clnt.h>
42 #include <linux/iobuf.h>
43 
44 #include <asm/system.h>
45 #include <asm/uaccess.h>
46 
47 #define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
48 #define VERF_SIZE		(2 * sizeof(__u32))
49 
50 static inline int
nfs_direct_read_rpc(struct file * file,struct nfs_readargs * arg)51 nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg)
52 {
53 	int result;
54 	struct inode * inode = file->f_dentry->d_inode;
55 	struct nfs_fattr fattr;
56         struct rpc_message msg;
57         struct nfs_readres res = { &fattr, arg->count, 0 };
58 
59 #ifdef CONFIG_NFS_V3
60 	msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ?
61 						NFS3PROC_READ : NFSPROC_READ;
62 #else
63 	msg.rpc_proc = NFSPROC_READ;
64 #endif
65 	msg.rpc_argp = arg;
66         msg.rpc_resp = &res;
67 
68 	lock_kernel();
69         msg.rpc_cred = nfs_file_cred(file);
70         fattr.valid = 0;
71         result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
72 	nfs_refresh_inode(inode, &fattr);
73 	unlock_kernel();
74 
75 	return result;
76 }
77 
78 static inline int
nfs_direct_write_rpc(struct file * file,struct nfs_writeargs * arg,struct nfs_writeverf * verf)79 nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg,
80 	struct nfs_writeverf *verf)
81 {
82 	int result;
83 	struct inode *inode = file->f_dentry->d_inode;
84 	struct nfs_fattr fattr;
85         struct rpc_message msg;
86         struct nfs_writeres res = { &fattr, verf, 0 };
87 
88 #ifdef CONFIG_NFS_V3
89 	msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ?
90 						NFS3PROC_WRITE : NFSPROC_WRITE;
91 #else
92 	msg.rpc_proc = NFSPROC_WRITE;
93 #endif
94 	msg.rpc_argp = arg;
95         msg.rpc_resp = &res;
96 
97 	lock_kernel();
98 	msg.rpc_cred = get_rpccred(nfs_file_cred(file));
99 	fattr.valid = 0;
100         result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
101 	nfs_write_attributes(inode, &fattr);
102 	put_rpccred(msg.rpc_cred);
103 	unlock_kernel();
104 
105 #ifdef CONFIG_NFS_V3
106 	if (NFS_PROTO(inode)->version == 3) {
107 		if (result > 0) {
108 			if ((arg->stable == NFS_FILE_SYNC) &&
109 			    (verf->committed != NFS_FILE_SYNC)) {
110 				printk(KERN_ERR
111 				"%s: server didn't sync stable write request\n",
112 				__FUNCTION__);
113 				return -EIO;
114 			}
115 
116 			if (result != arg->count) {
117 				printk(KERN_INFO
118 					"%s: short write, count=%u, result=%d\n",
119 					__FUNCTION__, arg->count, result);
120 			}
121 		}
122 		return result;
123 	} else {
124 #endif
125         	verf->committed = NFS_FILE_SYNC; /* NFSv2 always syncs data */
126 		if (result == 0)
127 			return arg->count;
128 		return result;
129 #ifdef CONFIG_NFS_V3
130 	}
131 #endif
132 }
133 
134 #ifdef CONFIG_NFS_V3
135 static inline int
nfs_direct_commit_rpc(struct inode * inode,loff_t offset,size_t count,struct nfs_writeverf * verf)136 nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
137 	struct nfs_writeverf *verf)
138 {
139 	int result;
140 	struct nfs_fattr fattr;
141 	struct nfs_writeargs	arg = { NFS_FH(inode), offset, count, 0, 0,
142 					NULL };
143 	struct nfs_writeres	res = { &fattr, verf, 0 };
144 	struct rpc_message	msg = { NFS3PROC_COMMIT, &arg, &res, NULL };
145 
146 	fattr.valid = 0;
147 
148 	lock_kernel();
149 	result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
150 	nfs_write_attributes(inode, &fattr);
151 	unlock_kernel();
152 
153 	return result;
154 }
155 #else
156 static inline int
nfs_direct_commit_rpc(struct inode * inode,loff_t offset,size_t count,struct nfs_writeverf * verf)157 nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count,
158 	struct nfs_writeverf *verf)
159 {
160 	return 0;
161 }
162 #endif
163 
164 /*
165  * Walk through the iobuf and create an iovec for each "rsize" bytes.
166  */
167 static int
nfs_direct_read(struct file * file,struct kiobuf * iobuf,loff_t offset,size_t count)168 nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset,
169 	size_t count)
170 {
171 	int curpage, total;
172 	int result = 0;
173 	struct inode *inode = file->f_dentry->d_inode;
174 	int rsize = NFS_SERVER(inode)->rsize;
175 	struct page *pages[NFS_READ_MAXIOV];
176 	struct nfs_readargs args = { NFS_FH(inode), offset, 0, iobuf->offset,
177 				     pages };
178 
179 	total = 0;
180 	curpage = 0;
181         while (count) {
182 		int len, request;
183 		struct page **dest = pages;
184 
185                 request = count;
186                 if (count > rsize)
187                         request = rsize;
188 		args.count = request;
189 		args.offset = offset;
190 		args.pgbase = (iobuf->offset + total) & ~PAGE_MASK;
191 		len = PAGE_SIZE - args.pgbase;
192 
193 		do {
194 			struct page *page = iobuf->maplist[curpage];
195 
196 			if (curpage >= iobuf->nr_pages || !page) {
197 				result = -EFAULT;
198 				goto out_err;
199 			}
200 
201 			*dest++ = page;
202 			/* zero after the first iov */
203 			if (request < len)
204 				break;
205 			request -= len;
206 			len = PAGE_SIZE;
207 			curpage++;
208 		} while (request != 0);
209 
210                 result = nfs_direct_read_rpc(file, &args);
211 
212                 if (result < 0)
213                         break;
214 
215                 total += result;
216                 if (result < args.count)   /* NFSv2ism */
217                         break;
218                 count -= result;
219                 offset += result;
220         };
221 out_err:
222 	if (!total)
223 		return result;
224 	return total;
225 }
226 
227 /*
228  * Walk through the iobuf and create an iovec for each "wsize" bytes.
229  * If only one network write is necessary, or if the O_SYNC flag or
230  * 'sync' mount option are present, or if this is a V2 inode, use
231  * FILE_SYNC.  Otherwise, use UNSTABLE and finish with a COMMIT.
232  *
233  * The mechanics of this function are much the same as nfs_direct_read,
234  * with the added complexity of committing unstable writes.
235  */
236 static int
nfs_direct_write(struct file * file,struct kiobuf * iobuf,loff_t offset,size_t count)237 nfs_direct_write(struct file *file, struct kiobuf *iobuf,
238 	loff_t offset, size_t count)
239 {
240 	int curpage, total;
241 	int need_commit = 0;
242 	int result = 0;
243 	loff_t save_offset = offset;
244 	struct inode *inode = file->f_dentry->d_inode;
245 	int wsize = NFS_SERVER(inode)->wsize;
246 	struct nfs_writeverf first_verf, ret_verf;
247 	struct page *pages[NFS_WRITE_MAXIOV];
248         struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0,
249 				pages };
250 
251 #ifdef CONFIG_NFS_V3
252 	if ((NFS_PROTO(inode)->version == 3) && (count > wsize) &&
253 							(!IS_SYNC(inode)))
254 		args.stable = NFS_UNSTABLE;
255 #endif
256 
257 retry:
258 	total = 0;
259 	curpage = 0;
260         while (count) {
261 		int len, request;
262 		struct page **dest = pages;
263 
264                 request = count;
265                 if (count > wsize)
266                         request = wsize;
267 		args.count = request;
268 		args.offset = offset;
269 		args.pgbase = (iobuf->offset + total) & ~PAGE_MASK;
270 		len = PAGE_SIZE - args.pgbase;
271 
272 		do {
273 			struct page *page = iobuf->maplist[curpage];
274 
275 			if (curpage >= iobuf->nr_pages || !page) {
276 				result = -EFAULT;
277 				goto out_err;
278 			}
279 
280 			*dest++ = page;
281 			/* zero after the first iov */
282 			if (request < len)
283 				break;
284 			request -= len;
285 			len = PAGE_SIZE;
286 			curpage++;
287 		} while (request != 0);
288 
289                 result = nfs_direct_write_rpc(file, &args, &ret_verf);
290 
291                 if (result < 0)
292                         break;
293 
294 		if (!total)
295 			memcpy(&first_verf.verifier, &ret_verf.verifier,
296 								VERF_SIZE);
297 		if (ret_verf.committed != NFS_FILE_SYNC) {
298 			need_commit = 1;
299 			if (memcmp(&first_verf.verifier, &ret_verf.verifier,
300 								VERF_SIZE))
301 				goto print_retry;
302 		}
303 
304                 total += result;
305                 count -= result;
306                 offset += result;
307         };
308 
309 out_err:
310 	/*
311 	 * Commit data written so far, even in the event of an error
312 	 */
313 	if (need_commit) {
314 		if (nfs_direct_commit_rpc(inode, save_offset,
315 					iobuf->length - count, &ret_verf))
316 			goto print_retry;
317 		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
318 								VERF_SIZE))
319 			goto print_retry;
320 	}
321 
322 	if (!total)
323 		return result;
324 	return total;
325 
326 print_retry:
327 	printk(KERN_INFO "%s: detected server restart; retrying with FILE_SYNC\n",
328 			__FUNCTION__);
329 	args.stable = NFS_FILE_SYNC;
330 	offset = save_offset;
331 	count = iobuf->length;
332 	goto retry;
333 }
334 
335 /*
336  * Read or write data, moving the data directly to/from the
337  * application's buffer without caching in the page cache.
338  *
339  * Rules for direct I/O
340  *
341  * 1.  block size = 512 bytes or more
342  * 2.  file byte offset is block aligned
343  * 3.  byte count is a multiple of block size
344  * 4.  user buffer is not aligned
345  * 5.  user buffer is faulted in and pinned
346  *
347  * These are verified before we get here.
348  */
349 int
nfs_direct_IO(int rw,struct file * file,struct kiobuf * iobuf,unsigned long blocknr,int blocksize)350 nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf,
351 	unsigned long blocknr, int blocksize)
352 {
353 	int result = -EINVAL;
354 	size_t count = iobuf->length;
355 	struct dentry *dentry = file->f_dentry;
356 	struct inode *inode = dentry->d_inode;
357 	loff_t offset = (loff_t) blocknr << inode->i_blkbits;
358 
359 	switch (rw) {
360 	case READ:
361 		dfprintk(VFS,
362 			"NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n",
363 				dentry->d_parent->d_name.name,
364 					dentry->d_name.name, offset, count);
365 
366 		result = nfs_direct_read(file, iobuf, offset, count);
367 		break;
368 	case WRITE:
369 		dfprintk(VFS,
370 			"NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n",
371 				dentry->d_parent->d_name.name,
372 					dentry->d_name.name, offset, count);
373 
374 		result = nfs_direct_write(file, iobuf, offset, count);
375 		break;
376 	default:
377 		break;
378 	}
379 
380 	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
381 	return result;
382 }
383