1 /*
2    drbd.c
3 
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5 
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9 
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12 
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17 
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26 
27  */
28 
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48 
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52 
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56 
57 #include "drbd_vli.h"
58 
59 struct after_state_chg_work {
60 	struct drbd_work w;
61 	union drbd_state os;
62 	union drbd_state ns;
63 	enum chg_state_flags flags;
64 	struct completion *done;
65 };
66 
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71 
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 			   union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82 
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84 	      "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
90 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91 
92 #include <linux/moduleparam.h>
93 /* allow_open_on_secondary */
94 MODULE_PARM_DESC(allow_oos, "DONT USE!");
95 /* thanks to these macros, if compiled into the kernel (not as a module),
96  * this becomes the boot parameter drbd.minor_count */
97 module_param(minor_count, uint, 0444);
98 module_param(disable_sendpage, bool, 0644);
99 module_param(allow_oos, bool, 0);
100 module_param(cn_idx, uint, 0444);
101 module_param(proc_details, int, 0644);
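/* Usage sketch (illustrative): with drbd loaded as a module, these parameters
 * are passed at load time, e.g.
 *	modprobe drbd minor_count=8 proc_details=1
 * With drbd built into the kernel they become boot parameters prefixed with
 * "drbd.", e.g. drbd.minor_count=8 on the kernel command line. */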
102 
103 #ifdef CONFIG_DRBD_FAULT_INJECTION
104 int enable_faults;
105 int fault_rate;
106 static int fault_count;
107 int fault_devs;
108 /* bitmap of enabled faults */
109 module_param(enable_faults, int, 0664);
110 /* fault rate % value - applies to all enabled faults */
111 module_param(fault_rate, int, 0664);
112 /* count of faults inserted */
113 module_param(fault_count, int, 0664);
114 /* bitmap of devices to insert faults on */
115 module_param(fault_devs, int, 0644);
116 #endif
117 
118 /* module parameter, defined */
119 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
120 int disable_sendpage;
121 int allow_oos;
122 unsigned int cn_idx = CN_IDX_DRBD;
123 int proc_details;       /* Detail level in /proc/drbd */
124 
125 /* Module parameter for setting the user mode helper program
126  * to run. Default is /sbin/drbdadm */
127 char usermode_helper[80] = "/sbin/drbdadm";
128 
129 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130 
131 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
132  * as member "struct gendisk *vdisk;"
133  */
134 struct drbd_conf **minor_table;
135 
136 struct kmem_cache *drbd_request_cache;
137 struct kmem_cache *drbd_ee_cache;	/* epoch entries */
138 struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
139 struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
140 mempool_t *drbd_request_mempool;
141 mempool_t *drbd_ee_mempool;
142 
143 /* I do not use a standard mempool, because:
144    1) I want to hand out the pre-allocated objects first.
145    2) I want to be able to interrupt sleeping allocation with a signal.
146    Note: This is a singly linked list; the next pointer is stored in the private
147 	 member of struct page.
148  */
149 struct page *drbd_pp_pool;
150 spinlock_t   drbd_pp_lock;
151 int          drbd_pp_vacant;
152 wait_queue_head_t drbd_pp_wait;
153 
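/*
 * Illustrative sketch (not the actual allocator, which lives elsewhere in
 * drbd): the pool is chained through the otherwise unused ->private field of
 * struct page, so pushing to and popping from drbd_pp_pool works roughly like
 * this, with drbd_pp_lock held:
 *
 *	// push: chain the page in front of the current pool head
 *	set_page_private(page, (unsigned long)drbd_pp_pool);
 *	drbd_pp_pool = page;
 *	drbd_pp_vacant++;
 *
 *	// pop: take the head, follow the private pointer to the next page
 *	page = drbd_pp_pool;
 *	drbd_pp_pool = (struct page *)page_private(page);
 *	set_page_private(page, 0);
 *	drbd_pp_vacant--;
 */
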
154 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
155 
156 static const struct block_device_operations drbd_ops = {
157 	.owner =   THIS_MODULE,
158 	.open =    drbd_open,
159 	.release = drbd_release,
160 };
161 
162 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
163 
164 #ifdef __CHECKER__
165 /* When checking with sparse, and this is an inline function, sparse will
166    give tons of false positives. When this is a real function, sparse works.
167  */
168 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
169 {
170 	int io_allowed;
171 
172 	atomic_inc(&mdev->local_cnt);
173 	io_allowed = (mdev->state.disk >= mins);
174 	if (!io_allowed) {
175 		if (atomic_dec_and_test(&mdev->local_cnt))
176 			wake_up(&mdev->misc_wait);
177 	}
178 	return io_allowed;
179 }
180 
181 #endif
182 
183 /**
184  * DOC: The transfer log
185  *
186  * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
187  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
188  * of the list. There is always at least one &struct drbd_tl_epoch object.
189  *
190  * Each &struct drbd_tl_epoch has a circular double linked list of requests
191  * attached.
192  */
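/*
 * Illustrative only: a walk over the transfer log follows the singly linked
 * ->next pointers from mdev->oldest_tle towards mdev->newest_tle and iterates
 * each epoch's request list (req_lock held), roughly:
 *
 *	struct drbd_tl_epoch *b;
 *	struct drbd_request *req;
 *
 *	for (b = mdev->oldest_tle; b; b = b->next)
 *		list_for_each_entry(req, &b->requests, tl_requests)
 *			; // inspect req
 *
 * _tl_restart() below is the real (and more careful) version of this walk.
 */
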
193 static int tl_init(struct drbd_conf *mdev)
194 {
195 	struct drbd_tl_epoch *b;
196 
197 	/* during device minor initialization, we may well use GFP_KERNEL */
198 	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
199 	if (!b)
200 		return 0;
201 	INIT_LIST_HEAD(&b->requests);
202 	INIT_LIST_HEAD(&b->w.list);
203 	b->next = NULL;
204 	b->br_number = 4711;
205 	b->n_writes = 0;
206 	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
207 
208 	mdev->oldest_tle = b;
209 	mdev->newest_tle = b;
210 	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
211 
212 	mdev->tl_hash = NULL;
213 	mdev->tl_hash_s = 0;
214 
215 	return 1;
216 }
217 
218 static void tl_cleanup(struct drbd_conf *mdev)
219 {
220 	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
221 	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
222 	kfree(mdev->oldest_tle);
223 	mdev->oldest_tle = NULL;
224 	kfree(mdev->unused_spare_tle);
225 	mdev->unused_spare_tle = NULL;
226 	kfree(mdev->tl_hash);
227 	mdev->tl_hash = NULL;
228 	mdev->tl_hash_s = 0;
229 }
230 
231 /**
232  * _tl_add_barrier() - Adds a barrier to the transfer log
233  * @mdev:	DRBD device.
234  * @new:	Barrier to be added before the current head of the TL.
235  *
236  * The caller must hold the req_lock.
237  */
238 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
239 {
240 	struct drbd_tl_epoch *newest_before;
241 
242 	INIT_LIST_HEAD(&new->requests);
243 	INIT_LIST_HEAD(&new->w.list);
244 	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
245 	new->next = NULL;
246 	new->n_writes = 0;
247 
248 	newest_before = mdev->newest_tle;
249 	/* never send a barrier number == 0, because that is special-cased
250 	 * when using TCQ for our write ordering code */
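	/* Note: GNU "x ?: y" evaluates to x unless x is 0, so when br_number
	 * wraps from 0xffffffff to 0 on the +1 below, we hand out 1 instead. */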
251 	new->br_number = (newest_before->br_number+1) ?: 1;
252 	if (mdev->newest_tle != new) {
253 		mdev->newest_tle->next = new;
254 		mdev->newest_tle = new;
255 	}
256 }
257 
258 /**
259  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
260  * @mdev:	DRBD device.
261  * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
262  * @set_size:	Expected number of requests before that barrier.
263  *
264  * In case the passed barrier_nr or set_size does not match the oldest
265  * &struct drbd_tl_epoch object, this function will cause a termination
266  * of the connection.
267  */
268 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
269 		       unsigned int set_size)
270 {
271 	struct drbd_tl_epoch *b, *nob; /* next old barrier */
272 	struct list_head *le, *tle;
273 	struct drbd_request *r;
274 
275 	spin_lock_irq(&mdev->req_lock);
276 
277 	b = mdev->oldest_tle;
278 
279 	/* first some paranoia code */
280 	if (b == NULL) {
281 		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
282 			barrier_nr);
283 		goto bail;
284 	}
285 	if (b->br_number != barrier_nr) {
286 		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
287 			barrier_nr, b->br_number);
288 		goto bail;
289 	}
290 	if (b->n_writes != set_size) {
291 		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
292 			barrier_nr, set_size, b->n_writes);
293 		goto bail;
294 	}
295 
296 	/* Clean up list of requests processed during current epoch */
297 	list_for_each_safe(le, tle, &b->requests) {
298 		r = list_entry(le, struct drbd_request, tl_requests);
299 		_req_mod(r, barrier_acked);
300 	}
301 	/* There could be requests on the list waiting for completion
302 	   of the write to the local disk. To avoid corruption of
303 	   the slab's data structures we have to remove the list's head.
304 
305 	   Also there could have been a barrier ack out of sequence, overtaking
306 	   the write acks - which would be a bug and violate write ordering.
307 	   To not deadlock in case we lose connection while such requests are
308 	   still pending, we need some way to find them for the
309 	   _req_mod(connection_lost_while_pending).
310 
311 	   These have been list_move'd to the out_of_sequence_requests list in
312 	   _req_mod(, barrier_acked) above.
313 	   */
314 	list_del_init(&b->requests);
315 
316 	nob = b->next;
317 	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
318 		_tl_add_barrier(mdev, b);
319 		if (nob)
320 			mdev->oldest_tle = nob;
321 		/* if nob == NULL, b was the only barrier, and becomes the new
322 		   barrier. Therefore mdev->oldest_tle already points to b */
323 	} else {
324 		D_ASSERT(nob != NULL);
325 		mdev->oldest_tle = nob;
326 		kfree(b);
327 	}
328 
329 	spin_unlock_irq(&mdev->req_lock);
330 	dec_ap_pending(mdev);
331 
332 	return;
333 
334 bail:
335 	spin_unlock_irq(&mdev->req_lock);
336 	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
337 }
338 
339 
340 /**
341  * _tl_restart() - Walks the transfer log, and applies an action to all requests
342  * @mdev:	DRBD device.
343  * @what:       The action/event to perform with all request objects
344  *
345  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
346  * restart_frozen_disk_io.
347  */
348 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
349 {
350 	struct drbd_tl_epoch *b, *tmp, **pn;
351 	struct list_head *le, *tle, carry_reads;
352 	struct drbd_request *req;
353 	int rv, n_writes, n_reads;
354 
355 	b = mdev->oldest_tle;
356 	pn = &mdev->oldest_tle;
357 	while (b) {
358 		n_writes = 0;
359 		n_reads = 0;
360 		INIT_LIST_HEAD(&carry_reads);
361 		list_for_each_safe(le, tle, &b->requests) {
362 			req = list_entry(le, struct drbd_request, tl_requests);
363 			rv = _req_mod(req, what);
364 
365 			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
366 			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
367 		}
368 		tmp = b->next;
369 
370 		if (n_writes) {
371 			if (what == resend) {
372 				b->n_writes = n_writes;
373 				if (b->w.cb == NULL) {
374 					b->w.cb = w_send_barrier;
375 					inc_ap_pending(mdev);
376 					set_bit(CREATE_BARRIER, &mdev->flags);
377 				}
378 
379 				drbd_queue_work(&mdev->data.work, &b->w);
380 			}
381 			pn = &b->next;
382 		} else {
383 			if (n_reads)
384 				list_add(&carry_reads, &b->requests);
385 			/* there could still be requests on that ring list,
386 			 * in case local io is still pending */
387 			list_del(&b->requests);
388 
389 			/* dec_ap_pending corresponding to queue_barrier.
390 			 * the newest barrier may not have been queued yet,
391 			 * in which case w.cb is still NULL. */
392 			if (b->w.cb != NULL)
393 				dec_ap_pending(mdev);
394 
395 			if (b == mdev->newest_tle) {
396 				/* recycle, but reinit! */
397 				D_ASSERT(tmp == NULL);
398 				INIT_LIST_HEAD(&b->requests);
399 				list_splice(&carry_reads, &b->requests);
400 				INIT_LIST_HEAD(&b->w.list);
401 				b->w.cb = NULL;
402 				b->br_number = net_random();
403 				b->n_writes = 0;
404 
405 				*pn = b;
406 				break;
407 			}
408 			*pn = tmp;
409 			kfree(b);
410 		}
411 		b = tmp;
412 		list_splice(&carry_reads, &b->requests);
413 	}
414 }
415 
416 
417 /**
418  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
419  * @mdev:	DRBD device.
420  *
421  * This is called after the connection to the peer was lost. The storage covered
422  * by the requests on the transfer log gets marked as out of sync. Called from the
423  * receiver thread and the worker thread.
424  */
425 void tl_clear(struct drbd_conf *mdev)
426 {
427 	struct list_head *le, *tle;
428 	struct drbd_request *r;
429 
430 	spin_lock_irq(&mdev->req_lock);
431 
432 	_tl_restart(mdev, connection_lost_while_pending);
433 
434 	/* we expect this list to be empty. */
435 	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
436 
437 	/* but just in case, clean it up anyway! */
438 	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
439 		r = list_entry(le, struct drbd_request, tl_requests);
440 		/* It would be nice to complete outside of spinlock.
441 		 * But this is easier for now. */
442 		_req_mod(r, connection_lost_while_pending);
443 	}
444 
445 	/* ensure bit indicating barrier is required is clear */
446 	clear_bit(CREATE_BARRIER, &mdev->flags);
447 
448 	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
449 
450 	spin_unlock_irq(&mdev->req_lock);
451 }
452 
453 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
454 {
455 	spin_lock_irq(&mdev->req_lock);
456 	_tl_restart(mdev, what);
457 	spin_unlock_irq(&mdev->req_lock);
458 }
459 
460 /**
461  * cl_wide_st_chg() - true if the state change is a cluster wide one
462  * @mdev:	DRBD device.
463  * @os:		old (current) state.
464  * @ns:		new (wanted) state.
465  */
466 static int cl_wide_st_chg(struct drbd_conf *mdev,
467 			  union drbd_state os, union drbd_state ns)
468 {
469 	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
470 		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
471 		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
472 		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
473 		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
474 		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
475 		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
476 }
477 
478 enum drbd_state_rv
479 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
480 		  union drbd_state mask, union drbd_state val)
481 {
482 	unsigned long flags;
483 	union drbd_state os, ns;
484 	enum drbd_state_rv rv;
485 
486 	spin_lock_irqsave(&mdev->req_lock, flags);
487 	os = mdev->state;
488 	ns.i = (os.i & ~mask.i) | val.i;
489 	rv = _drbd_set_state(mdev, ns, f, NULL);
490 	ns = mdev->state;
491 	spin_unlock_irqrestore(&mdev->req_lock, flags);
492 
493 	return rv;
494 }
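/*
 * Worked example of the mask/val arithmetic above (illustrative; this is
 * essentially what the NS() helper used elsewhere in drbd builds): to request
 * only a connection state change, set just the conn field of the mask to all
 * ones and put the wanted value into val:
 *
 *	union drbd_state mask, val;
 *
 *	mask.i = 0; mask.conn = C_MASK;
 *	val.i  = 0; val.conn  = C_DISCONNECTING;
 *	drbd_change_state(mdev, CS_VERBOSE, mask, val);
 *
 * ns.i = (os.i & ~mask.i) | val.i then clears exactly the conn bits of the old
 * state and ors in the new connection state, leaving all other fields alone.
 */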
495 
496 /**
497  * drbd_force_state() - Impose a change which happens outside our control on our state
498  * @mdev:	DRBD device.
499  * @mask:	mask of state bits to change.
500  * @val:	value of new state bits.
501  */
502 void drbd_force_state(struct drbd_conf *mdev,
503 	union drbd_state mask, union drbd_state val)
504 {
505 	drbd_change_state(mdev, CS_HARD, mask, val);
506 }
507 
508 static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
509 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
510 						    union drbd_state,
511 						    union drbd_state);
512 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
513 				       union drbd_state ns, const char **warn_sync_abort);
514 int drbd_send_state_req(struct drbd_conf *,
515 			union drbd_state, union drbd_state);
516 
517 static enum drbd_state_rv
518 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
519 	     union drbd_state val)
520 {
521 	union drbd_state os, ns;
522 	unsigned long flags;
523 	enum drbd_state_rv rv;
524 
525 	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
526 		return SS_CW_SUCCESS;
527 
528 	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
529 		return SS_CW_FAILED_BY_PEER;
530 
531 	rv = 0;
532 	spin_lock_irqsave(&mdev->req_lock, flags);
533 	os = mdev->state;
534 	ns.i = (os.i & ~mask.i) | val.i;
535 	ns = sanitize_state(mdev, os, ns, NULL);
536 
537 	if (!cl_wide_st_chg(mdev, os, ns))
538 		rv = SS_CW_NO_NEED;
539 	if (!rv) {
540 		rv = is_valid_state(mdev, ns);
541 		if (rv == SS_SUCCESS) {
542 			rv = is_valid_state_transition(mdev, ns, os);
543 			if (rv == SS_SUCCESS)
544 				rv = SS_UNKNOWN_ERROR; /* continue waiting, otherwise fail. */
545 		}
546 	}
547 	spin_unlock_irqrestore(&mdev->req_lock, flags);
548 
549 	return rv;
550 }
551 
552 /**
553  * drbd_req_state() - Perform a possibly cluster-wide state change
554  * @mdev:	DRBD device.
555  * @mask:	mask of state bits to change.
556  * @val:	value of new state bits.
557  * @f:		flags
558  *
559  * Should not be called directly, use drbd_request_state() or
560  * _drbd_request_state().
561  */
562 static enum drbd_state_rv
563 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
564 	       union drbd_state val, enum chg_state_flags f)
565 {
566 	struct completion done;
567 	unsigned long flags;
568 	union drbd_state os, ns;
569 	enum drbd_state_rv rv;
570 
571 	init_completion(&done);
572 
573 	if (f & CS_SERIALIZE)
574 		mutex_lock(&mdev->state_mutex);
575 
576 	spin_lock_irqsave(&mdev->req_lock, flags);
577 	os = mdev->state;
578 	ns.i = (os.i & ~mask.i) | val.i;
579 	ns = sanitize_state(mdev, os, ns, NULL);
580 
581 	if (cl_wide_st_chg(mdev, os, ns)) {
582 		rv = is_valid_state(mdev, ns);
583 		if (rv == SS_SUCCESS)
584 			rv = is_valid_state_transition(mdev, ns, os);
585 		spin_unlock_irqrestore(&mdev->req_lock, flags);
586 
587 		if (rv < SS_SUCCESS) {
588 			if (f & CS_VERBOSE)
589 				print_st_err(mdev, os, ns, rv);
590 			goto abort;
591 		}
592 
593 		drbd_state_lock(mdev);
594 		if (!drbd_send_state_req(mdev, mask, val)) {
595 			drbd_state_unlock(mdev);
596 			rv = SS_CW_FAILED_BY_PEER;
597 			if (f & CS_VERBOSE)
598 				print_st_err(mdev, os, ns, rv);
599 			goto abort;
600 		}
601 
602 		wait_event(mdev->state_wait,
603 			(rv = _req_st_cond(mdev, mask, val)));
604 
605 		if (rv < SS_SUCCESS) {
606 			drbd_state_unlock(mdev);
607 			if (f & CS_VERBOSE)
608 				print_st_err(mdev, os, ns, rv);
609 			goto abort;
610 		}
611 		spin_lock_irqsave(&mdev->req_lock, flags);
612 		os = mdev->state;
613 		ns.i = (os.i & ~mask.i) | val.i;
614 		rv = _drbd_set_state(mdev, ns, f, &done);
615 		drbd_state_unlock(mdev);
616 	} else {
617 		rv = _drbd_set_state(mdev, ns, f, &done);
618 	}
619 
620 	spin_unlock_irqrestore(&mdev->req_lock, flags);
621 
622 	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
623 		D_ASSERT(current != mdev->worker.task);
624 		wait_for_completion(&done);
625 	}
626 
627 abort:
628 	if (f & CS_SERIALIZE)
629 		mutex_unlock(&mdev->state_mutex);
630 
631 	return rv;
632 }
633 
634 /**
635  * _drbd_request_state() - Request a state change (with flags)
636  * @mdev:	DRBD device.
637  * @mask:	mask of state bits to change.
638  * @val:	value of new state bits.
639  * @f:		flags
640  *
641  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
642  * flag, or when logging of failed state change requests is not desired.
643  */
644 enum drbd_state_rv
645 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
646 		    union drbd_state val, enum chg_state_flags f)
647 {
648 	enum drbd_state_rv rv;
649 
650 	wait_event(mdev->state_wait,
651 		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
652 
653 	return rv;
654 }
655 
656 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
657 {
658 	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
659 	    name,
660 	    drbd_conn_str(ns.conn),
661 	    drbd_role_str(ns.role),
662 	    drbd_role_str(ns.peer),
663 	    drbd_disk_str(ns.disk),
664 	    drbd_disk_str(ns.pdsk),
665 	    is_susp(ns) ? 's' : 'r',
666 	    ns.aftr_isp ? 'a' : '-',
667 	    ns.peer_isp ? 'p' : '-',
668 	    ns.user_isp ? 'u' : '-'
669 	    );
670 }
671 
672 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
673 	          union drbd_state ns, enum drbd_state_rv err)
674 {
675 	if (err == SS_IN_TRANSIENT_STATE)
676 		return;
677 	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
678 	print_st(mdev, " state", os);
679 	print_st(mdev, "wanted", ns);
680 }
681 
682 
683 /**
684  * is_valid_state() - Returns an SS_ error code if ns is not valid
685  * @mdev:	DRBD device.
686  * @ns:		State to consider.
687  */
688 static enum drbd_state_rv
689 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
690 {
691 	/* See drbd_state_sw_errors in drbd_strings.c */
692 
693 	enum drbd_fencing_p fp;
694 	enum drbd_state_rv rv = SS_SUCCESS;
695 
696 	fp = FP_DONT_CARE;
697 	if (get_ldev(mdev)) {
698 		fp = mdev->ldev->dc.fencing;
699 		put_ldev(mdev);
700 	}
701 
702 	if (get_net_conf(mdev)) {
703 		if (!mdev->net_conf->two_primaries &&
704 		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
705 			rv = SS_TWO_PRIMARIES;
706 		put_net_conf(mdev);
707 	}
708 
709 	if (rv <= 0)
710 		/* already found a reason to abort */;
711 	else if (ns.role == R_SECONDARY && mdev->open_cnt)
712 		rv = SS_DEVICE_IN_USE;
713 
714 	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
715 		rv = SS_NO_UP_TO_DATE_DISK;
716 
717 	else if (fp >= FP_RESOURCE &&
718 		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
719 		rv = SS_PRIMARY_NOP;
720 
721 	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
722 		rv = SS_NO_UP_TO_DATE_DISK;
723 
724 	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
725 		rv = SS_NO_LOCAL_DISK;
726 
727 	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
728 		rv = SS_NO_REMOTE_DISK;
729 
730 	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
731 		rv = SS_NO_UP_TO_DATE_DISK;
732 
733 	else if ((ns.conn == C_CONNECTED ||
734 		  ns.conn == C_WF_BITMAP_S ||
735 		  ns.conn == C_SYNC_SOURCE ||
736 		  ns.conn == C_PAUSED_SYNC_S) &&
737 		  ns.disk == D_OUTDATED)
738 		rv = SS_CONNECTED_OUTDATES;
739 
740 	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
741 		 (mdev->sync_conf.verify_alg[0] == 0))
742 		rv = SS_NO_VERIFY_ALG;
743 
744 	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
745 		  mdev->agreed_pro_version < 88)
746 		rv = SS_NOT_SUPPORTED;
747 
748 	return rv;
749 }
750 
751 /**
752  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
753  * @mdev:	DRBD device.
754  * @ns:		new state.
755  * @os:		old state.
756  */
757 static enum drbd_state_rv
758 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
759 			  union drbd_state os)
760 {
761 	enum drbd_state_rv rv = SS_SUCCESS;
762 
763 	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
764 	    os.conn > C_CONNECTED)
765 		rv = SS_RESYNC_RUNNING;
766 
767 	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
768 		rv = SS_ALREADY_STANDALONE;
769 
770 	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
771 		rv = SS_IS_DISKLESS;
772 
773 	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
774 		rv = SS_NO_NET_CONFIG;
775 
776 	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
777 		rv = SS_LOWER_THAN_OUTDATED;
778 
779 	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
780 		rv = SS_IN_TRANSIENT_STATE;
781 
782 	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
783 		rv = SS_IN_TRANSIENT_STATE;
784 
785 	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
786 		rv = SS_NEED_CONNECTION;
787 
788 	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
789 	    ns.conn != os.conn && os.conn > C_CONNECTED)
790 		rv = SS_RESYNC_RUNNING;
791 
792 	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
793 	    os.conn < C_CONNECTED)
794 		rv = SS_NEED_CONNECTION;
795 
796 	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
797 	    && os.conn < C_WF_REPORT_PARAMS)
798 		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
799 
800 	return rv;
801 }
802 
803 /**
804  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
805  * @mdev:	DRBD device.
806  * @os:		old state.
807  * @ns:		new state.
808  * @warn_sync_abort:
809  *
810  * When we lose the connection, we have to set the state of the peer's disk (pdsk)
811  * to D_UNKNOWN. This rule and many more along those lines are in this function.
812  */
813 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
814 				       union drbd_state ns, const char **warn_sync_abort)
815 {
816 	enum drbd_fencing_p fp;
817 	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
818 
819 	fp = FP_DONT_CARE;
820 	if (get_ldev(mdev)) {
821 		fp = mdev->ldev->dc.fencing;
822 		put_ldev(mdev);
823 	}
824 
825 	/* Do not let a network error state appear while the network part is not configured */
826 	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
827 	    os.conn <= C_DISCONNECTING)
828 		ns.conn = os.conn;
829 
830 	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
831 	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
832 	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
833 	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
834 		ns.conn = os.conn;
835 
836 	/* we cannot fail (again) if we already detached */
837 	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
838 		ns.disk = D_DISKLESS;
839 
840 	/* if we are only D_ATTACHING yet,
841 	 * we can (and should) go directly to D_DISKLESS. */
842 	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
843 		ns.disk = D_DISKLESS;
844 
845 	/* After C_DISCONNECTING only C_STANDALONE may follow */
846 	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
847 		ns.conn = os.conn;
848 
849 	if (ns.conn < C_CONNECTED) {
850 		ns.peer_isp = 0;
851 		ns.peer = R_UNKNOWN;
852 		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
853 			ns.pdsk = D_UNKNOWN;
854 	}
855 
856 	/* Clear the aftr_isp when becoming unconfigured */
857 	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
858 		ns.aftr_isp = 0;
859 
860 	/* Abort resync if a disk fails/detaches */
861 	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
862 	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
863 		if (warn_sync_abort)
864 			*warn_sync_abort =
865 				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
866 				"Online-verify" : "Resync";
867 		ns.conn = C_CONNECTED;
868 	}
869 
870 	/* Connection breaks down before we finished "Negotiating" */
871 	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
872 	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
873 		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
874 			ns.disk = mdev->new_state_tmp.disk;
875 			ns.pdsk = mdev->new_state_tmp.pdsk;
876 		} else {
877 			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
878 			ns.disk = D_DISKLESS;
879 			ns.pdsk = D_UNKNOWN;
880 		}
881 		put_ldev(mdev);
882 	}
883 
884 	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
885 	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
886 		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
887 			ns.disk = D_UP_TO_DATE;
888 		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
889 			ns.pdsk = D_UP_TO_DATE;
890 	}
891 
892 	/* Implications of the connection state on the disk states */
893 	disk_min = D_DISKLESS;
894 	disk_max = D_UP_TO_DATE;
895 	pdsk_min = D_INCONSISTENT;
896 	pdsk_max = D_UNKNOWN;
897 	switch ((enum drbd_conns)ns.conn) {
898 	case C_WF_BITMAP_T:
899 	case C_PAUSED_SYNC_T:
900 	case C_STARTING_SYNC_T:
901 	case C_WF_SYNC_UUID:
902 	case C_BEHIND:
903 		disk_min = D_INCONSISTENT;
904 		disk_max = D_OUTDATED;
905 		pdsk_min = D_UP_TO_DATE;
906 		pdsk_max = D_UP_TO_DATE;
907 		break;
908 	case C_VERIFY_S:
909 	case C_VERIFY_T:
910 		disk_min = D_UP_TO_DATE;
911 		disk_max = D_UP_TO_DATE;
912 		pdsk_min = D_UP_TO_DATE;
913 		pdsk_max = D_UP_TO_DATE;
914 		break;
915 	case C_CONNECTED:
916 		disk_min = D_DISKLESS;
917 		disk_max = D_UP_TO_DATE;
918 		pdsk_min = D_DISKLESS;
919 		pdsk_max = D_UP_TO_DATE;
920 		break;
921 	case C_WF_BITMAP_S:
922 	case C_PAUSED_SYNC_S:
923 	case C_STARTING_SYNC_S:
924 	case C_AHEAD:
925 		disk_min = D_UP_TO_DATE;
926 		disk_max = D_UP_TO_DATE;
927 		pdsk_min = D_INCONSISTENT;
928 		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */
929 		break;
930 	case C_SYNC_TARGET:
931 		disk_min = D_INCONSISTENT;
932 		disk_max = D_INCONSISTENT;
933 		pdsk_min = D_UP_TO_DATE;
934 		pdsk_max = D_UP_TO_DATE;
935 		break;
936 	case C_SYNC_SOURCE:
937 		disk_min = D_UP_TO_DATE;
938 		disk_max = D_UP_TO_DATE;
939 		pdsk_min = D_INCONSISTENT;
940 		pdsk_max = D_INCONSISTENT;
941 		break;
942 	case C_STANDALONE:
943 	case C_DISCONNECTING:
944 	case C_UNCONNECTED:
945 	case C_TIMEOUT:
946 	case C_BROKEN_PIPE:
947 	case C_NETWORK_FAILURE:
948 	case C_PROTOCOL_ERROR:
949 	case C_TEAR_DOWN:
950 	case C_WF_CONNECTION:
951 	case C_WF_REPORT_PARAMS:
952 	case C_MASK:
953 		break;
954 	}
955 	if (ns.disk > disk_max)
956 		ns.disk = disk_max;
957 
958 	if (ns.disk < disk_min) {
959 		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
960 			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
961 		ns.disk = disk_min;
962 	}
963 	if (ns.pdsk > pdsk_max)
964 		ns.pdsk = pdsk_max;
965 
966 	if (ns.pdsk < pdsk_min) {
967 		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
968 			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
969 		ns.pdsk = pdsk_min;
970 	}
971 
972 	if (fp == FP_STONITH &&
973 	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
974 	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
975 		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
976 
977 	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
978 	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
979 	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
980 		ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */
981 
982 	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
983 		if (ns.conn == C_SYNC_SOURCE)
984 			ns.conn = C_PAUSED_SYNC_S;
985 		if (ns.conn == C_SYNC_TARGET)
986 			ns.conn = C_PAUSED_SYNC_T;
987 	} else {
988 		if (ns.conn == C_PAUSED_SYNC_S)
989 			ns.conn = C_SYNC_SOURCE;
990 		if (ns.conn == C_PAUSED_SYNC_T)
991 			ns.conn = C_SYNC_TARGET;
992 	}
993 
994 	return ns;
995 }
996 
997 /* helper for __drbd_set_state */
998 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
999 {
1000 	if (mdev->agreed_pro_version < 90)
1001 		mdev->ov_start_sector = 0;
1002 	mdev->rs_total = drbd_bm_bits(mdev);
1003 	mdev->ov_position = 0;
1004 	if (cs == C_VERIFY_T) {
1005 		/* starting online verify from an arbitrary position
1006 		 * does not fit well into the existing protocol.
1007 		 * on C_VERIFY_T, we initialize ov_left and friends
1008 		 * implicitly in receive_DataRequest once the
1009 		 * first P_OV_REQUEST is received */
1010 		mdev->ov_start_sector = ~(sector_t)0;
1011 	} else {
1012 		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1013 		if (bit >= mdev->rs_total) {
1014 			mdev->ov_start_sector =
1015 				BM_BIT_TO_SECT(mdev->rs_total - 1);
1016 			mdev->rs_total = 1;
1017 		} else
1018 			mdev->rs_total -= bit;
1019 		mdev->ov_position = mdev->ov_start_sector;
1020 	}
1021 	mdev->ov_left = mdev->rs_total;
1022 }
1023 
1024 static void drbd_resume_al(struct drbd_conf *mdev)
1025 {
1026 	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1027 		dev_info(DEV, "Resumed AL updates\n");
1028 }
1029 
1030 /**
1031  * __drbd_set_state() - Set a new DRBD state
1032  * @mdev:	DRBD device.
1033  * @ns:		new state.
1034  * @flags:	Flags
1035  * @done:	Optional completion that will be completed after after_state_ch() has finished
1036  *
1037  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1038  */
1039 enum drbd_state_rv
1040 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1041 	         enum chg_state_flags flags, struct completion *done)
1042 {
1043 	union drbd_state os;
1044 	enum drbd_state_rv rv = SS_SUCCESS;
1045 	const char *warn_sync_abort = NULL;
1046 	struct after_state_chg_work *ascw;
1047 
1048 	os = mdev->state;
1049 
1050 	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1051 
1052 	if (ns.i == os.i)
1053 		return SS_NOTHING_TO_DO;
1054 
1055 	if (!(flags & CS_HARD)) {
1056 		/*  pre-state-change checks ; only look at ns  */
1057 		/* See drbd_state_sw_errors in drbd_strings.c */
1058 
1059 		rv = is_valid_state(mdev, ns);
1060 		if (rv < SS_SUCCESS) {
1061 			/* If the old state was illegal as well, then let
1062 			   this happen...*/
1063 
1064 			if (is_valid_state(mdev, os) == rv)
1065 				rv = is_valid_state_transition(mdev, ns, os);
1066 		} else
1067 			rv = is_valid_state_transition(mdev, ns, os);
1068 	}
1069 
1070 	if (rv < SS_SUCCESS) {
1071 		if (flags & CS_VERBOSE)
1072 			print_st_err(mdev, os, ns, rv);
1073 		return rv;
1074 	}
1075 
1076 	if (warn_sync_abort)
1077 		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1078 
1079 	{
1080 	char *pbp, pb[300];
1081 	pbp = pb;
1082 	*pbp = 0;
1083 	if (ns.role != os.role)
1084 		pbp += sprintf(pbp, "role( %s -> %s ) ",
1085 			       drbd_role_str(os.role),
1086 			       drbd_role_str(ns.role));
1087 	if (ns.peer != os.peer)
1088 		pbp += sprintf(pbp, "peer( %s -> %s ) ",
1089 			       drbd_role_str(os.peer),
1090 			       drbd_role_str(ns.peer));
1091 	if (ns.conn != os.conn)
1092 		pbp += sprintf(pbp, "conn( %s -> %s ) ",
1093 			       drbd_conn_str(os.conn),
1094 			       drbd_conn_str(ns.conn));
1095 	if (ns.disk != os.disk)
1096 		pbp += sprintf(pbp, "disk( %s -> %s ) ",
1097 			       drbd_disk_str(os.disk),
1098 			       drbd_disk_str(ns.disk));
1099 	if (ns.pdsk != os.pdsk)
1100 		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1101 			       drbd_disk_str(os.pdsk),
1102 			       drbd_disk_str(ns.pdsk));
1103 	if (is_susp(ns) != is_susp(os))
1104 		pbp += sprintf(pbp, "susp( %d -> %d ) ",
1105 			       is_susp(os),
1106 			       is_susp(ns));
1107 	if (ns.aftr_isp != os.aftr_isp)
1108 		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1109 			       os.aftr_isp,
1110 			       ns.aftr_isp);
1111 	if (ns.peer_isp != os.peer_isp)
1112 		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1113 			       os.peer_isp,
1114 			       ns.peer_isp);
1115 	if (ns.user_isp != os.user_isp)
1116 		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1117 			       os.user_isp,
1118 			       ns.user_isp);
1119 	dev_info(DEV, "%s\n", pb);
1120 	}
1121 
1122 	/* solve the race between becoming unconfigured,
1123 	 * worker doing the cleanup, and
1124 	 * admin reconfiguring us:
1125 	 * on (re)configure, first set CONFIG_PENDING,
1126 	 * then wait for a potentially exiting worker,
1127 	 * start the worker, and schedule one no_op.
1128 	 * then proceed with configuration.
1129 	 */
1130 	if (ns.disk == D_DISKLESS &&
1131 	    ns.conn == C_STANDALONE &&
1132 	    ns.role == R_SECONDARY &&
1133 	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1134 		set_bit(DEVICE_DYING, &mdev->flags);
1135 
1136 	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1137 	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1138 	 * drbd_ldev_destroy() won't happen before our corresponding
1139 	 * after_state_ch works run, where we put_ldev again. */
1140 	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1141 	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1142 		atomic_inc(&mdev->local_cnt);
1143 
1144 	mdev->state = ns;
1145 
1146 	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1147 		drbd_print_uuids(mdev, "attached to UUIDs");
1148 
1149 	wake_up(&mdev->misc_wait);
1150 	wake_up(&mdev->state_wait);
1151 
1152 	/* aborted verify run. log the last position */
1153 	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1154 	    ns.conn < C_CONNECTED) {
1155 		mdev->ov_start_sector =
1156 			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1157 		dev_info(DEV, "Online Verify reached sector %llu\n",
1158 			(unsigned long long)mdev->ov_start_sector);
1159 	}
1160 
1161 	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1162 	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1163 		dev_info(DEV, "Syncer continues.\n");
1164 		mdev->rs_paused += (long)jiffies
1165 				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1166 		if (ns.conn == C_SYNC_TARGET)
1167 			mod_timer(&mdev->resync_timer, jiffies);
1168 	}
1169 
1170 	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1171 	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1172 		dev_info(DEV, "Resync suspended\n");
1173 		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1174 	}
1175 
1176 	if (os.conn == C_CONNECTED &&
1177 	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1178 		unsigned long now = jiffies;
1179 		int i;
1180 
1181 		set_ov_position(mdev, ns.conn);
1182 		mdev->rs_start = now;
1183 		mdev->rs_last_events = 0;
1184 		mdev->rs_last_sect_ev = 0;
1185 		mdev->ov_last_oos_size = 0;
1186 		mdev->ov_last_oos_start = 0;
1187 
1188 		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1189 			mdev->rs_mark_left[i] = mdev->ov_left;
1190 			mdev->rs_mark_time[i] = now;
1191 		}
1192 
1193 		drbd_rs_controller_reset(mdev);
1194 
1195 		if (ns.conn == C_VERIFY_S) {
1196 			dev_info(DEV, "Starting Online Verify from sector %llu\n",
1197 					(unsigned long long)mdev->ov_position);
1198 			mod_timer(&mdev->resync_timer, jiffies);
1199 		}
1200 	}
1201 
1202 	if (get_ldev(mdev)) {
1203 		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1204 						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1205 						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1206 
1207 		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1208 			mdf |= MDF_CRASHED_PRIMARY;
1209 		if (mdev->state.role == R_PRIMARY ||
1210 		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1211 			mdf |= MDF_PRIMARY_IND;
1212 		if (mdev->state.conn > C_WF_REPORT_PARAMS)
1213 			mdf |= MDF_CONNECTED_IND;
1214 		if (mdev->state.disk > D_INCONSISTENT)
1215 			mdf |= MDF_CONSISTENT;
1216 		if (mdev->state.disk > D_OUTDATED)
1217 			mdf |= MDF_WAS_UP_TO_DATE;
1218 		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1219 			mdf |= MDF_PEER_OUT_DATED;
1220 		if (mdf != mdev->ldev->md.flags) {
1221 			mdev->ldev->md.flags = mdf;
1222 			drbd_md_mark_dirty(mdev);
1223 		}
1224 		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1225 			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1226 		put_ldev(mdev);
1227 	}
1228 
1229 	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1230 	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1231 	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1232 		set_bit(CONSIDER_RESYNC, &mdev->flags);
1233 
1234 	/* Receiver should clean up itself */
1235 	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1236 		drbd_thread_stop_nowait(&mdev->receiver);
1237 
1238 	/* Now the receiver finished cleaning up itself, it should die */
1239 	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1240 		drbd_thread_stop_nowait(&mdev->receiver);
1241 
1242 	/* Upon network failure, we need to restart the receiver. */
1243 	if (os.conn > C_TEAR_DOWN &&
1244 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1245 		drbd_thread_restart_nowait(&mdev->receiver);
1246 
1247 	/* Resume AL writing if we get a connection */
1248 	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1249 		drbd_resume_al(mdev);
1250 
1251 	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1252 	if (ascw) {
1253 		ascw->os = os;
1254 		ascw->ns = ns;
1255 		ascw->flags = flags;
1256 		ascw->w.cb = w_after_state_ch;
1257 		ascw->done = done;
1258 		drbd_queue_work(&mdev->data.work, &ascw->w);
1259 	} else {
1260 		dev_warn(DEV, "Could not kmalloc an ascw\n");
1261 	}
1262 
1263 	return rv;
1264 }
1265 
1266 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1267 {
1268 	struct after_state_chg_work *ascw =
1269 		container_of(w, struct after_state_chg_work, w);
1270 	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1271 	if (ascw->flags & CS_WAIT_COMPLETE) {
1272 		D_ASSERT(ascw->done != NULL);
1273 		complete(ascw->done);
1274 	}
1275 	kfree(ascw);
1276 
1277 	return 1;
1278 }
1279 
1280 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1281 {
1282 	if (rv) {
1283 		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1284 		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1285 		return;
1286 	}
1287 
1288 	switch (mdev->state.conn) {
1289 	case C_STARTING_SYNC_T:
1290 		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1291 		break;
1292 	case C_STARTING_SYNC_S:
1293 		drbd_start_resync(mdev, C_SYNC_SOURCE);
1294 		break;
1295 	}
1296 }
1297 
1298 int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1299 		int (*io_fn)(struct drbd_conf *),
1300 		char *why, enum bm_flag flags)
1301 {
1302 	int rv;
1303 
1304 	D_ASSERT(current == mdev->worker.task);
1305 
1306 	/* open coded non-blocking drbd_suspend_io(mdev); */
1307 	set_bit(SUSPEND_IO, &mdev->flags);
1308 
1309 	drbd_bm_lock(mdev, why, flags);
1310 	rv = io_fn(mdev);
1311 	drbd_bm_unlock(mdev);
1312 
1313 	drbd_resume_io(mdev);
1314 
1315 	return rv;
1316 }
1317 
1318 /**
1319  * after_state_ch() - Perform after state change actions that may sleep
1320  * @mdev:	DRBD device.
1321  * @os:		old state.
1322  * @ns:		new state.
1323  * @flags:	Flags
1324  */
1325 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1326 			   union drbd_state ns, enum chg_state_flags flags)
1327 {
1328 	enum drbd_fencing_p fp;
1329 	enum drbd_req_event what = nothing;
1330 	union drbd_state nsm = (union drbd_state){ .i = -1 };
1331 
1332 	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1333 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
1334 		if (mdev->p_uuid)
1335 			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1336 	}
1337 
1338 	fp = FP_DONT_CARE;
1339 	if (get_ldev(mdev)) {
1340 		fp = mdev->ldev->dc.fencing;
1341 		put_ldev(mdev);
1342 	}
1343 
1344 	/* Inform userspace about the change... */
1345 	drbd_bcast_state(mdev, ns);
1346 
1347 	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1348 	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1349 		drbd_khelper(mdev, "pri-on-incon-degr");
1350 
1351 	/* Here we have the actions that are performed after a
1352 	   state change. This function might sleep */
1353 
1354 	nsm.i = -1;
1355 	if (ns.susp_nod) {
1356 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1357 			what = resend;
1358 
1359 		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1360 			what = restart_frozen_disk_io;
1361 
1362 		if (what != nothing)
1363 			nsm.susp_nod = 0;
1364 	}
1365 
1366 	if (ns.susp_fen) {
1367 		/* case1: The outdate peer handler is successful: */
1368 		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1369 			tl_clear(mdev);
1370 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1371 				drbd_uuid_new_current(mdev);
1372 				clear_bit(NEW_CUR_UUID, &mdev->flags);
1373 			}
1374 			spin_lock_irq(&mdev->req_lock);
1375 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1376 			spin_unlock_irq(&mdev->req_lock);
1377 		}
1378 		/* case2: The connection was established again: */
1379 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1380 			clear_bit(NEW_CUR_UUID, &mdev->flags);
1381 			what = resend;
1382 			nsm.susp_fen = 0;
1383 		}
1384 	}
1385 
1386 	if (what != nothing) {
1387 		spin_lock_irq(&mdev->req_lock);
1388 		_tl_restart(mdev, what);
1389 		nsm.i &= mdev->state.i;
1390 		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1391 		spin_unlock_irq(&mdev->req_lock);
1392 	}
1393 
1394 	/* Became sync source.  With protocol >= 96, we still need to send out
1395 	 * the sync uuid now. Need to do that before any drbd_send_state, or
1396 	 * the other side may go "paused sync" before receiving the sync uuids,
1397 	 * which is unexpected. */
1398 	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1399 	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1400 	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1401 		drbd_gen_and_send_sync_uuid(mdev);
1402 		put_ldev(mdev);
1403 	}
1404 
1405 	/* Do not change the order of the if above and the two below... */
1406 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1407 		drbd_send_uuids(mdev);
1408 		drbd_send_state(mdev);
1409 	}
1410 	/* No point in queuing send_bitmap if we don't have a connection
1411 	 * anymore, so check also the _current_ state, not only the new state
1412 	 * at the time this work was queued. */
1413 	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1414 	    mdev->state.conn == C_WF_BITMAP_S)
1415 		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1416 				"send_bitmap (WFBitMapS)",
1417 				BM_LOCKED_TEST_ALLOWED);
1418 
1419 	/* Lost contact to peer's copy of the data */
1420 	if ((os.pdsk >= D_INCONSISTENT &&
1421 	     os.pdsk != D_UNKNOWN &&
1422 	     os.pdsk != D_OUTDATED)
1423 	&&  (ns.pdsk < D_INCONSISTENT ||
1424 	     ns.pdsk == D_UNKNOWN ||
1425 	     ns.pdsk == D_OUTDATED)) {
1426 		if (get_ldev(mdev)) {
1427 			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1428 			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1429 				if (is_susp(mdev->state)) {
1430 					set_bit(NEW_CUR_UUID, &mdev->flags);
1431 				} else {
1432 					drbd_uuid_new_current(mdev);
1433 					drbd_send_uuids(mdev);
1434 				}
1435 			}
1436 			put_ldev(mdev);
1437 		}
1438 	}
1439 
1440 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1441 		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1442 			drbd_uuid_new_current(mdev);
1443 			drbd_send_uuids(mdev);
1444 		}
1445 
1446 		/* D_DISKLESS Peer becomes secondary */
1447 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1448 			/* We may still be Primary ourselves.
1449 			 * No harm done if the bitmap still changes,
1450 			 * redirtied pages will follow later. */
1451 			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1452 				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
1453 		put_ldev(mdev);
1454 	}
1455 
1456 	/* Write out all changed bits on demote.
1457 	 * Though, no need to do that just yet
1458 	 * if there is a resync going on still */
1459 	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1460 		mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1461 		/* No changes to the bitmap expected this time, so assert that,
1462 		 * even though no harm was done if it did change. */
1463 		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1464 				"demote", BM_LOCKED_TEST_ALLOWED);
1465 		put_ldev(mdev);
1466 	}
1467 
1468 	/* Last part of the attaching process ... */
1469 	if (ns.conn >= C_CONNECTED &&
1470 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1471 		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1472 		drbd_send_uuids(mdev);
1473 		drbd_send_state(mdev);
1474 	}
1475 
1476 	/* We want to pause/continue resync, tell peer. */
1477 	if (ns.conn >= C_CONNECTED &&
1478 	     ((os.aftr_isp != ns.aftr_isp) ||
1479 	      (os.user_isp != ns.user_isp)))
1480 		drbd_send_state(mdev);
1481 
1482 	/* In case one of the isp bits got set, suspend other devices. */
1483 	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1484 	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1485 		suspend_other_sg(mdev);
1486 
1487 	/* Make sure the peer gets informed about possible state
1488 	   changes (ISP bits) while we were in WFReportParams. */
1489 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1490 		drbd_send_state(mdev);
1491 
1492 	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1493 		drbd_send_state(mdev);
1494 
1495 	/* We are in the progress to start a full sync... */
1496 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1497 	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1498 		/* no other bitmap changes expected during this phase */
1499 		drbd_queue_bitmap_io(mdev,
1500 			&drbd_bmio_set_n_write, &abw_start_sync,
1501 			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1502 
1503 	/* We are invalidating ourselves... */
1504 	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1505 	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1506 		/* other bitmap operations are expected during this phase */
1507 		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1508 			"set_n_write from invalidate", BM_LOCKED_MASK);
1509 
1510 	/* first half of local IO error, failure to attach,
1511 	 * or administrative detach */
1512 	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1513 		enum drbd_io_error_p eh;
1514 		int was_io_error;
1515 		/* corresponding get_ldev was in __drbd_set_state, to serialize
1516 		 * our cleanup here with the transition to D_DISKLESS,
1517 		 * so it is safe to dereference ldev here. */
1518 		eh = mdev->ldev->dc.on_io_error;
1519 		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1520 
1521 		/* current state still has to be D_FAILED,
1522 		 * there is only one way out: to D_DISKLESS,
1523 		 * and that may only happen after our put_ldev below. */
1524 		if (mdev->state.disk != D_FAILED)
1525 			dev_err(DEV,
1526 				"ASSERT FAILED: disk is %s during detach\n",
1527 				drbd_disk_str(mdev->state.disk));
1528 
1529 		if (drbd_send_state(mdev))
1530 			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1531 		else
1532 			dev_err(DEV, "Sending state for detaching disk failed\n");
1533 
1534 		drbd_rs_cancel_all(mdev);
1535 
1536 		/* In case we want to get something to stable storage still,
1537 		 * this may be the last chance.
1538 		 * Following put_ldev may transition to D_DISKLESS. */
1539 		drbd_md_sync(mdev);
1540 		put_ldev(mdev);
1541 
1542 		if (was_io_error && eh == EP_CALL_HELPER)
1543 			drbd_khelper(mdev, "local-io-error");
1544 	}
1545 
1546 	/* second half of local IO error, failure to attach,
1547 	 * or administrative detach,
1548 	 * after local_cnt references have reached zero again */
1549 	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1550 		/* We must still be diskless,
1551 		 * re-attach has to be serialized with this! */
1552 		if (mdev->state.disk != D_DISKLESS)
1553 			dev_err(DEV,
1554 				"ASSERT FAILED: disk is %s while going diskless\n",
1555 				drbd_disk_str(mdev->state.disk));
1556 
1557 		mdev->rs_total = 0;
1558 		mdev->rs_failed = 0;
1559 		atomic_set(&mdev->rs_pending_cnt, 0);
1560 
1561 		if (drbd_send_state(mdev))
1562 			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1563 		/* corresponding get_ldev in __drbd_set_state
1564 		 * this may finally trigger drbd_ldev_destroy. */
1565 		put_ldev(mdev);
1566 	}
1567 
1568 	/* Disks got bigger while they were detached */
1569 	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1570 	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1571 		if (ns.conn == C_CONNECTED)
1572 			resync_after_online_grow(mdev);
1573 	}
1574 
1575 	/* A resync finished or aborted, wake paused devices... */
1576 	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1577 	    (os.peer_isp && !ns.peer_isp) ||
1578 	    (os.user_isp && !ns.user_isp))
1579 		resume_next_sg(mdev);
1580 
1581 	/* sync target done with resync.  Explicitly notify peer, even though
1582 	 * it should (at least for non-empty resyncs) already know itself. */
1583 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1584 		drbd_send_state(mdev);
1585 
1586 	/* This triggers bitmap writeout of potentially still unwritten pages
1587 	 * if the resync finished cleanly, or aborted because of peer disk
1588 	 * failure, or because of connection loss.
1589 	 * For resync aborted because of local disk failure, we cannot do
1590 	 * any bitmap writeout anymore.
1591 	 * No harm done if some bits change during this phase.
1592 	 */
1593 	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1594 		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1595 			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
1596 		put_ldev(mdev);
1597 	}
1598 
1599 	/* free tl_hash if we got thawed and are C_STANDALONE */
1600 	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1601 		drbd_free_tl_hash(mdev);
1602 
1603 	/* Upon network connection, we need to start the receiver */
1604 	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1605 		drbd_thread_start(&mdev->receiver);
1606 
1607 	/* Terminate worker thread if we are unconfigured - it will be
1608 	   restarted as needed... */
1609 	if (ns.disk == D_DISKLESS &&
1610 	    ns.conn == C_STANDALONE &&
1611 	    ns.role == R_SECONDARY) {
1612 		if (os.aftr_isp != ns.aftr_isp)
1613 			resume_next_sg(mdev);
1614 		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
1615 		if (test_bit(DEVICE_DYING, &mdev->flags))
1616 			drbd_thread_stop_nowait(&mdev->worker);
1617 	}
1618 
1619 	drbd_md_sync(mdev);
1620 }
1621 
1622 
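/**
 * drbd_thread_setup() - common kthread entry point for all DRBD threads
 * @arg:	the struct drbd_thread this kthread was created for.
 *
 * Runs thi->function() in a loop: if, under t_lock, the thread state is
 * found to be Restarting, the function is invoked again; otherwise the
 * thread marks itself None, completes &thi->stop and drops the module
 * reference that drbd_thread_start() took on its behalf.
 */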
1623 static int drbd_thread_setup(void *arg)
1624 {
1625 	struct drbd_thread *thi = (struct drbd_thread *) arg;
1626 	struct drbd_conf *mdev = thi->mdev;
1627 	unsigned long flags;
1628 	int retval;
1629 
1630 restart:
1631 	retval = thi->function(thi);
1632 
1633 	spin_lock_irqsave(&thi->t_lock, flags);
1634 
1635 	/* if the receiver has been "Exiting", the last thing it did
1636 	 * was set the conn state to "StandAlone",
1637 	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1638 	 * and receiver thread will be "started".
1639 	 * drbd_thread_start needs to set "Restarting" in that case.
1640 	 * t_state check and assignment needs to be within the same spinlock,
1641 	 * so either thread_start sees Exiting, and can remap to Restarting,
1642 	 * or thread_start sees None, and can proceed as normal.
1643 	 */
1644 
1645 	if (thi->t_state == Restarting) {
1646 		dev_info(DEV, "Restarting %s\n", current->comm);
1647 		thi->t_state = Running;
1648 		spin_unlock_irqrestore(&thi->t_lock, flags);
1649 		goto restart;
1650 	}
1651 
1652 	thi->task = NULL;
1653 	thi->t_state = None;
1654 	smp_mb();
1655 	complete(&thi->stop);
1656 	spin_unlock_irqrestore(&thi->t_lock, flags);
1657 
1658 	dev_info(DEV, "Terminating %s\n", current->comm);
1659 
1660 	/* Release mod reference taken when thread was started */
1661 	module_put(THIS_MODULE);
1662 	return retval;
1663 }
1664 
1665 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1666 		      int (*func) (struct drbd_thread *))
1667 {
1668 	spin_lock_init(&thi->t_lock);
1669 	thi->task    = NULL;
1670 	thi->t_state = None;
1671 	thi->function = func;
1672 	thi->mdev = mdev;
1673 }
1674 
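/**
 * drbd_thread_start() - start or re-arm the receiver, asender or worker thread
 * @thi:	thread descriptor embedded in struct drbd_conf.
 *
 * If the thread is None, a kthread named "drbd<minor>_<name>" is created and
 * a module reference is taken on its behalf.  If it is currently Exiting, it
 * is flagged Restarting so drbd_thread_setup() loops instead of terminating.
 * Returns true on success or if the thread was already running, false if the
 * module reference or the kthread could not be obtained.
 */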
1675 int drbd_thread_start(struct drbd_thread *thi)
1676 {
1677 	struct drbd_conf *mdev = thi->mdev;
1678 	struct task_struct *nt;
1679 	unsigned long flags;
1680 
1681 	const char *me =
1682 		thi == &mdev->receiver ? "receiver" :
1683 		thi == &mdev->asender  ? "asender"  :
1684 		thi == &mdev->worker   ? "worker"   : "NONSENSE";
1685 
1686 	/* is used from state engine doing drbd_thread_stop_nowait,
1687 	 * while holding the req lock irqsave */
1688 	spin_lock_irqsave(&thi->t_lock, flags);
1689 
1690 	switch (thi->t_state) {
1691 	case None:
1692 		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1693 				me, current->comm, current->pid);
1694 
1695 		/* Get ref on module for thread - this is released when thread exits */
1696 		if (!try_module_get(THIS_MODULE)) {
1697 			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1698 			spin_unlock_irqrestore(&thi->t_lock, flags);
1699 			return false;
1700 		}
1701 
1702 		init_completion(&thi->stop);
1703 		D_ASSERT(thi->task == NULL);
1704 		thi->reset_cpu_mask = 1;
1705 		thi->t_state = Running;
1706 		spin_unlock_irqrestore(&thi->t_lock, flags);
1707 		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1708 
1709 		nt = kthread_create(drbd_thread_setup, (void *) thi,
1710 				    "drbd%d_%s", mdev_to_minor(mdev), me);
1711 
1712 		if (IS_ERR(nt)) {
1713 			dev_err(DEV, "Couldn't start thread\n");
1714 
1715 			module_put(THIS_MODULE);
1716 			return false;
1717 		}
1718 		spin_lock_irqsave(&thi->t_lock, flags);
1719 		thi->task = nt;
1720 		thi->t_state = Running;
1721 		spin_unlock_irqrestore(&thi->t_lock, flags);
1722 		wake_up_process(nt);
1723 		break;
1724 	case Exiting:
1725 		thi->t_state = Restarting;
1726 		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1727 				me, current->comm, current->pid);
1728 		/* fall through */
1729 	case Running:
1730 	case Restarting:
1731 	default:
1732 		spin_unlock_irqrestore(&thi->t_lock, flags);
1733 		break;
1734 	}
1735 
1736 	return true;
1737 }
1738 
1739 
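/**
 * _drbd_thread_stop() - ask a DRBD thread to exit (or restart)
 * @thi:	thread to stop.
 * @restart:	request Restarting instead of Exiting.
 * @wait:	block on &thi->stop until the thread has terminated.
 *
 * Sends DRBD_SIGKILL to the thread (unless it is stopping itself) to
 * interrupt any blocking call it may be in.  May be called from the state
 * engine with the req lock held, hence the irqsave locking.
 */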
1740 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1741 {
1742 	unsigned long flags;
1743 
1744 	enum drbd_thread_state ns = restart ? Restarting : Exiting;
1745 
1746 	/* may be called from state engine, holding the req lock irqsave */
1747 	spin_lock_irqsave(&thi->t_lock, flags);
1748 
1749 	if (thi->t_state == None) {
1750 		spin_unlock_irqrestore(&thi->t_lock, flags);
1751 		if (restart)
1752 			drbd_thread_start(thi);
1753 		return;
1754 	}
1755 
1756 	if (thi->t_state != ns) {
1757 		if (thi->task == NULL) {
1758 			spin_unlock_irqrestore(&thi->t_lock, flags);
1759 			return;
1760 		}
1761 
1762 		thi->t_state = ns;
1763 		smp_mb();
1764 		init_completion(&thi->stop);
1765 		if (thi->task != current)
1766 			force_sig(DRBD_SIGKILL, thi->task);
1767 
1768 	}
1769 
1770 	spin_unlock_irqrestore(&thi->t_lock, flags);
1771 
1772 	if (wait)
1773 		wait_for_completion(&thi->stop);
1774 }
1775 
1776 #ifdef CONFIG_SMP
1777 /**
1778  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1779  * @mdev:	DRBD device.
1780  *
1781  * Forces all threads of a device onto the same CPU. This is beneficial for
1782  * DRBD's performance. May be overwritten by user's configuration.
1783  */
1784 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1785 {
1786 	int ord, cpu;
1787 
1788 	/* user override. */
1789 	if (cpumask_weight(mdev->cpu_mask))
1790 		return;
1791 
1792 	ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1793 	for_each_online_cpu(cpu) {
1794 		if (ord-- == 0) {
1795 			cpumask_set_cpu(cpu, mdev->cpu_mask);
1796 			return;
1797 		}
1798 	}
1799 	/* should not be reached */
1800 	cpumask_setall(mdev->cpu_mask);
1801 }
1802 
1803 /**
1804  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1805  * @mdev:	DRBD device.
1806  *
1807  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1808  * prematurely.
1809  */
1810 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1811 {
1812 	struct task_struct *p = current;
1813 	struct drbd_thread *thi =
1814 		p == mdev->asender.task  ? &mdev->asender  :
1815 		p == mdev->receiver.task ? &mdev->receiver :
1816 		p == mdev->worker.task   ? &mdev->worker   :
1817 		NULL;
1818 	ERR_IF(thi == NULL)
1819 		return;
1820 	if (!thi->reset_cpu_mask)
1821 		return;
1822 	thi->reset_cpu_mask = 0;
1823 	set_cpus_allowed_ptr(p, mdev->cpu_mask);
1824 }
1825 #endif
1826 
1827 /* the appropriate socket mutex must be held already */
1828 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1829 			  enum drbd_packets cmd, struct p_header80 *h,
1830 			  size_t size, unsigned msg_flags)
1831 {
1832 	int sent, ok;
1833 
1834 	ERR_IF(!h) return false;
1835 	ERR_IF(!size) return false;
1836 
1837 	h->magic   = BE_DRBD_MAGIC;
1838 	h->command = cpu_to_be16(cmd);
1839 	h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1840 
1841 	sent = drbd_send(mdev, sock, h, size, msg_flags);
1842 
1843 	ok = (sent == size);
1844 	if (!ok && !signal_pending(current))
1845 		dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1846 		    cmdname(cmd), (int)size, sent);
1847 	return ok;
1848 }
1849 
1850 /* don't pass the socket. we may only look at it
1851  * when we hold the appropriate socket mutex.
1852  */
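/* A typical call, as used further down in this file:
 *
 *	drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
 *		      (struct p_header80 *)&p, sizeof(p));
 *
 * USE_DATA_SOCKET routes the packet over the bulk data socket,
 * USE_META_SOCKET over the meta data socket used for acks and other
 * small control packets (see the sock/msock comparison before drbd_send()).
 */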
1853 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1854 		  enum drbd_packets cmd, struct p_header80 *h, size_t size)
1855 {
1856 	int ok = 0;
1857 	struct socket *sock;
1858 
1859 	if (use_data_socket) {
1860 		mutex_lock(&mdev->data.mutex);
1861 		sock = mdev->data.socket;
1862 	} else {
1863 		mutex_lock(&mdev->meta.mutex);
1864 		sock = mdev->meta.socket;
1865 	}
1866 
1867 	/* drbd_disconnect() could have called drbd_free_sock()
1868 	 * while we were waiting in down()... */
1869 	if (likely(sock != NULL))
1870 		ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1871 
1872 	if (use_data_socket)
1873 		mutex_unlock(&mdev->data.mutex);
1874 	else
1875 		mutex_unlock(&mdev->meta.mutex);
1876 	return ok;
1877 }
1878 
1879 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1880 		   size_t size)
1881 {
1882 	struct p_header80 h;
1883 	int ok;
1884 
1885 	h.magic   = BE_DRBD_MAGIC;
1886 	h.command = cpu_to_be16(cmd);
1887 	h.length  = cpu_to_be16(size);
1888 
1889 	if (!drbd_get_data_sock(mdev))
1890 		return 0;
1891 
1892 	ok = (sizeof(h) ==
1893 		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1894 	ok = ok && (size ==
1895 		drbd_send(mdev, mdev->data.socket, data, size, 0));
1896 
1897 	drbd_put_data_sock(mdev);
1898 
1899 	return ok;
1900 }
1901 
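/*
 * drbd_send_sync_param() - send the syncer configuration to the peer.
 *
 * The on-the-wire size depends on the agreed protocol version (apv):
 *	apv <= 87:  struct p_rs_param                (rate only)
 *	apv == 88:  p_rs_param + verify_alg string
 *	apv 89-94:  struct p_rs_param_89             (verify_alg + csums_alg)
 *	apv >= 95:  struct p_rs_param_95             (adds the c_* fields)
 * The pre-allocated send buffer of the data socket is used instead of a
 * kmalloc, since this runs from admin as well as receiver/worker context.
 */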
1902 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1903 {
1904 	struct p_rs_param_95 *p;
1905 	struct socket *sock;
1906 	int size, rv;
1907 	const int apv = mdev->agreed_pro_version;
1908 
1909 	size = apv <= 87 ? sizeof(struct p_rs_param)
1910 		: apv == 88 ? sizeof(struct p_rs_param)
1911 			+ strlen(mdev->sync_conf.verify_alg) + 1
1912 		: apv <= 94 ? sizeof(struct p_rs_param_89)
1913 		: /* apv >= 95 */ sizeof(struct p_rs_param_95);
1914 
1915 	/* used from admin command context and receiver/worker context.
1916 	 * to avoid kmalloc, grab the socket right here,
1917 	 * then use the pre-allocated sbuf there */
1918 	mutex_lock(&mdev->data.mutex);
1919 	sock = mdev->data.socket;
1920 
1921 	if (likely(sock != NULL)) {
1922 		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1923 
1924 		p = &mdev->data.sbuf.rs_param_95;
1925 
1926 		/* initialize verify_alg and csums_alg */
1927 		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1928 
1929 		p->rate = cpu_to_be32(sc->rate);
1930 		p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1931 		p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1932 		p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1933 		p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1934 
1935 		if (apv >= 88)
1936 			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1937 		if (apv >= 89)
1938 			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1939 
1940 		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1941 	} else
1942 		rv = 0; /* not ok */
1943 
1944 	mutex_unlock(&mdev->data.mutex);
1945 
1946 	return rv;
1947 }
1948 
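/*
 * drbd_send_protocol() - send wire protocol and after-split-brain policies.
 *
 * For apv >= 87 the packet also carries the variable length integrity_alg
 * string, hence the GFP_NOIO allocation.  Returns the drbd_send_cmd()
 * result, 0 on allocation failure, or -1 if --dry-run was requested but the
 * peer (apv < 92) cannot understand CF_DRY_RUN.
 */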
1949 int drbd_send_protocol(struct drbd_conf *mdev)
1950 {
1951 	struct p_protocol *p;
1952 	int size, cf, rv;
1953 
1954 	size = sizeof(struct p_protocol);
1955 
1956 	if (mdev->agreed_pro_version >= 87)
1957 		size += strlen(mdev->net_conf->integrity_alg) + 1;
1958 
1959 	/* we must not recurse into our own queue,
1960 	 * as that is blocked during handshake */
1961 	p = kmalloc(size, GFP_NOIO);
1962 	if (p == NULL)
1963 		return 0;
1964 
1965 	p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1966 	p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1967 	p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1968 	p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1969 	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1970 
1971 	cf = 0;
1972 	if (mdev->net_conf->want_lose)
1973 		cf |= CF_WANT_LOSE;
1974 	if (mdev->net_conf->dry_run) {
1975 		if (mdev->agreed_pro_version >= 92)
1976 			cf |= CF_DRY_RUN;
1977 		else {
1978 			dev_err(DEV, "--dry-run is not supported by peer");
1979 			kfree(p);
1980 			return -1;
1981 		}
1982 	}
1983 	p->conn_flags    = cpu_to_be32(cf);
1984 
1985 	if (mdev->agreed_pro_version >= 87)
1986 		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1987 
1988 	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1989 			   (struct p_header80 *)p, size);
1990 	kfree(p);
1991 	return rv;
1992 }
1993 
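/*
 * _drbd_send_uuids() - send our UUID set plus a flags word to the peer.
 *
 * p.uuid[UI_SIZE] carries the current number of bits set in the bitmap;
 * p.uuid[UI_FLAGS] carries @uuid_flags OR'ed with
 *	1  we are configured to discard our data ("want_lose"),
 *	2  we were a crashed primary,
 *	4  the tentative disk state (new_state_tmp) is D_INCONSISTENT,
 * and callers may pass 8 to announce a skipped initial sync, see
 * drbd_send_uuids_skip_initial_sync().
 */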
1994 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1995 {
1996 	struct p_uuids p;
1997 	int i;
1998 
1999 	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2000 		return 1;
2001 
2002 	for (i = UI_CURRENT; i < UI_SIZE; i++)
2003 		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2004 
2005 	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2006 	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2007 	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2008 	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2009 	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2010 	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2011 
2012 	put_ldev(mdev);
2013 
2014 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2015 			     (struct p_header80 *)&p, sizeof(p));
2016 }
2017 
2018 int drbd_send_uuids(struct drbd_conf *mdev)
2019 {
2020 	return _drbd_send_uuids(mdev, 0);
2021 }
2022 
2023 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2024 {
2025 	return _drbd_send_uuids(mdev, 8);
2026 }
2027 
2028 void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2029 {
2030 	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2031 		u64 *uuid = mdev->ldev->md.uuid;
2032 		dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2033 		     text,
2034 		     (unsigned long long)uuid[UI_CURRENT],
2035 		     (unsigned long long)uuid[UI_BITMAP],
2036 		     (unsigned long long)uuid[UI_HISTORY_START],
2037 		     (unsigned long long)uuid[UI_HISTORY_END]);
2038 		put_ldev(mdev);
2039 	} else {
2040 		dev_info(DEV, "%s effective data uuid: %016llX\n",
2041 				text,
2042 				(unsigned long long)mdev->ed_uuid);
2043 	}
2044 }
2045 
2046 int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2047 {
2048 	struct p_rs_uuid p;
2049 	u64 uuid;
2050 
2051 	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2052 
2053 	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
2054 	drbd_uuid_set(mdev, UI_BITMAP, uuid);
2055 	drbd_print_uuids(mdev, "updated sync UUID");
2056 	drbd_md_sync(mdev);
2057 	p.uuid = cpu_to_be64(uuid);
2058 
2059 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2060 			     (struct p_header80 *)&p, sizeof(p));
2061 }
2062 
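/*
 * drbd_send_sizes() - announce our disk geometry and limits (P_SIZES).
 *
 * d_size is the usable capacity of the backing device, u_size the user
 * configured size limit, c_size the currently exposed capacity (sent as 0
 * when @trigger_reply is set, to provoke a P_SIZES answer from the peer),
 * and max_bio_size is the request queue's max_hw_sectors in bytes.
 */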
2063 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2064 {
2065 	struct p_sizes p;
2066 	sector_t d_size, u_size;
2067 	int q_order_type;
2068 	int ok;
2069 
2070 	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2071 		D_ASSERT(mdev->ldev->backing_bdev);
2072 		d_size = drbd_get_max_capacity(mdev->ldev);
2073 		u_size = mdev->ldev->dc.disk_size;
2074 		q_order_type = drbd_queue_order_type(mdev);
2075 		put_ldev(mdev);
2076 	} else {
2077 		d_size = 0;
2078 		u_size = 0;
2079 		q_order_type = QUEUE_ORDERED_NONE;
2080 	}
2081 
2082 	p.d_size = cpu_to_be64(d_size);
2083 	p.u_size = cpu_to_be64(u_size);
2084 	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2085 	p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
2086 	p.queue_order_type = cpu_to_be16(q_order_type);
2087 	p.dds_flags = cpu_to_be16(flags);
2088 
2089 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2090 			   (struct p_header80 *)&p, sizeof(p));
2091 	return ok;
2092 }
2093 
2094 /**
2095  * drbd_send_state() - Sends the drbd state to the peer
2096  * @mdev:	DRBD device.
2097  */
2098 int drbd_send_state(struct drbd_conf *mdev)
2099 {
2100 	struct socket *sock;
2101 	struct p_state p;
2102 	int ok = 0;
2103 
2104 	/* Grab state lock so we won't send state if we're in the middle
2105 	 * of a cluster wide state change on another thread */
2106 	drbd_state_lock(mdev);
2107 
2108 	mutex_lock(&mdev->data.mutex);
2109 
2110 	p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2111 	sock = mdev->data.socket;
2112 
2113 	if (likely(sock != NULL)) {
2114 		ok = _drbd_send_cmd(mdev, sock, P_STATE,
2115 				    (struct p_header80 *)&p, sizeof(p), 0);
2116 	}
2117 
2118 	mutex_unlock(&mdev->data.mutex);
2119 
2120 	drbd_state_unlock(mdev);
2121 	return ok;
2122 }
2123 
2124 int drbd_send_state_req(struct drbd_conf *mdev,
2125 	union drbd_state mask, union drbd_state val)
2126 {
2127 	struct p_req_state p;
2128 
2129 	p.mask    = cpu_to_be32(mask.i);
2130 	p.val     = cpu_to_be32(val.i);
2131 
2132 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2133 			     (struct p_header80 *)&p, sizeof(p));
2134 }
2135 
2136 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2137 {
2138 	struct p_req_state_reply p;
2139 
2140 	p.retcode    = cpu_to_be32(retcode);
2141 
2142 	return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2143 			     (struct p_header80 *)&p, sizeof(p));
2144 }
2145 
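/*
 * fill_bitmap_rle_bits() - run-length + VLI encode a chunk of the bitmap.
 *
 * The bitmap is walked as alternating runs of clear and set bits, and each
 * run length is variable-length-integer encoded into p->code; whether the
 * very first run consists of set bits is recorded via DCBP_set_start(),
 * the starting bit offset itself is implicit (taken from @c).
 *
 * Returns the number of code bytes on success, 0 if this chunk did not
 * compress (the caller then sends plain bitmap words) or if RLE may not be
 * used at all (use_rle disabled or peer too old), and -1 on encoding errors.
 */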
2146 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2147 	struct p_compressed_bm *p,
2148 	struct bm_xfer_ctx *c)
2149 {
2150 	struct bitstream bs;
2151 	unsigned long plain_bits;
2152 	unsigned long tmp;
2153 	unsigned long rl;
2154 	unsigned len;
2155 	unsigned toggle;
2156 	int bits;
2157 
2158 	/* may we use this feature? */
2159 	if ((mdev->sync_conf.use_rle == 0) ||
2160 		(mdev->agreed_pro_version < 90))
2161 			return 0;
2162 
2163 	if (c->bit_offset >= c->bm_bits)
2164 		return 0; /* nothing to do. */
2165 
2166 	/* use at most this many bytes */
2167 	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2168 	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2169 	/* plain bits covered in this code string */
2170 	plain_bits = 0;
2171 
2172 	/* p->encoding & 0x80 stores whether the first run length is set.
2173 	 * bit offset is implicit.
2174 	 * start with toggle == 2 to be able to tell the first iteration */
2175 	toggle = 2;
2176 
2177 	/* see how many plain bits we can stuff into one packet
2178 	 * using RLE and VLI. */
2179 	do {
2180 		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2181 				    : _drbd_bm_find_next(mdev, c->bit_offset);
2182 		if (tmp == -1UL)
2183 			tmp = c->bm_bits;
2184 		rl = tmp - c->bit_offset;
2185 
2186 		if (toggle == 2) { /* first iteration */
2187 			if (rl == 0) {
2188 				/* the first checked bit was set,
2189 				 * store start value, */
2190 				DCBP_set_start(p, 1);
2191 				/* but skip encoding of zero run length */
2192 				toggle = !toggle;
2193 				continue;
2194 			}
2195 			DCBP_set_start(p, 0);
2196 		}
2197 
2198 		/* paranoia: catch zero runlength.
2199 		 * can only happen if bitmap is modified while we scan it. */
2200 		if (rl == 0) {
2201 			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2202 			    "t:%u bo:%lu\n", toggle, c->bit_offset);
2203 			return -1;
2204 		}
2205 
2206 		bits = vli_encode_bits(&bs, rl);
2207 		if (bits == -ENOBUFS) /* buffer full */
2208 			break;
2209 		if (bits <= 0) {
2210 			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2211 			return 0;
2212 		}
2213 
2214 		toggle = !toggle;
2215 		plain_bits += rl;
2216 		c->bit_offset = tmp;
2217 	} while (c->bit_offset < c->bm_bits);
2218 
2219 	len = bs.cur.b - p->code + !!bs.cur.bit;
2220 
2221 	if (plain_bits < (len << 3)) {
2222 		/* incompressible with this method.
2223 		 * we need to rewind both word and bit position. */
2224 		c->bit_offset -= plain_bits;
2225 		bm_xfer_ctx_bit_to_word_offset(c);
2226 		c->bit_offset = c->word_offset * BITS_PER_LONG;
2227 		return 0;
2228 	}
2229 
2230 	/* RLE + VLI was able to compress it just fine.
2231 	 * update c->word_offset. */
2232 	bm_xfer_ctx_bit_to_word_offset(c);
2233 
2234 	/* store pad_bits */
2235 	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2236 
2237 	return len;
2238 }
2239 
2240 /**
2241  * send_bitmap_rle_or_plain
2242  *
2243  * Return 0 when done, 1 when another iteration is needed, and a negative error
2244  * code upon failure.
2245  */
2246 static int
2247 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2248 			 struct p_header80 *h, struct bm_xfer_ctx *c)
2249 {
2250 	struct p_compressed_bm *p = (void*)h;
2251 	unsigned long num_words;
2252 	int len;
2253 	int ok;
2254 
2255 	len = fill_bitmap_rle_bits(mdev, p, c);
2256 
2257 	if (len < 0)
2258 		return -EIO;
2259 
2260 	if (len) {
2261 		DCBP_set_code(p, RLE_VLI_Bits);
2262 		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2263 			sizeof(*p) + len, 0);
2264 
2265 		c->packets[0]++;
2266 		c->bytes[0] += sizeof(*p) + len;
2267 
2268 		if (c->bit_offset >= c->bm_bits)
2269 			len = 0; /* DONE */
2270 	} else {
2271 		/* was not compressible.
2272 		 * send a buffer full of plain text bits instead. */
2273 		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2274 		len = num_words * sizeof(long);
2275 		if (len)
2276 			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2277 		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2278 				   h, sizeof(struct p_header80) + len, 0);
2279 		c->word_offset += num_words;
2280 		c->bit_offset = c->word_offset * BITS_PER_LONG;
2281 
2282 		c->packets[1]++;
2283 		c->bytes[1] += sizeof(struct p_header80) + len;
2284 
2285 		if (c->bit_offset > c->bm_bits)
2286 			c->bit_offset = c->bm_bits;
2287 	}
2288 	if (ok) {
2289 		if (len == 0) {
2290 			INFO_bm_xfer_stats(mdev, "send", c);
2291 			return 0;
2292 		} else
2293 			return 1;
2294 	}
2295 	return -EIO;
2296 }
2297 
2298 /* See the comment at receive_bitmap() */
2299 int _drbd_send_bitmap(struct drbd_conf *mdev)
2300 {
2301 	struct bm_xfer_ctx c;
2302 	struct p_header80 *p;
2303 	int err;
2304 
2305 	ERR_IF(!mdev->bitmap) return false;
2306 
2307 	/* maybe we should use some per thread scratch page,
2308 	 * and allocate that during initial device creation? */
2309 	p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2310 	if (!p) {
2311 		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2312 		return false;
2313 	}
2314 
2315 	if (get_ldev(mdev)) {
2316 		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2317 			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2318 			drbd_bm_set_all(mdev);
2319 			if (drbd_bm_write(mdev)) {
2320 				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
2321 				 * but otherwise process as per normal - need to tell other
2322 				 * side that a full resync is required! */
2323 				dev_err(DEV, "Failed to write bitmap to disk!\n");
2324 			} else {
2325 				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2326 				drbd_md_sync(mdev);
2327 			}
2328 		}
2329 		put_ldev(mdev);
2330 	}
2331 
2332 	c = (struct bm_xfer_ctx) {
2333 		.bm_bits = drbd_bm_bits(mdev),
2334 		.bm_words = drbd_bm_words(mdev),
2335 	};
2336 
2337 	do {
2338 		err = send_bitmap_rle_or_plain(mdev, p, &c);
2339 	} while (err > 0);
2340 
2341 	free_page((unsigned long) p);
2342 	return err == 0;
2343 }
2344 
2345 int drbd_send_bitmap(struct drbd_conf *mdev)
2346 {
2347 	int err;
2348 
2349 	if (!drbd_get_data_sock(mdev))
2350 		return -1;
2351 	err = !_drbd_send_bitmap(mdev);
2352 	drbd_put_data_sock(mdev);
2353 	return err;
2354 }
2355 
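/*
 * drbd_send_b_ack() - acknowledge a write barrier on the meta data socket.
 *
 * @barrier_nr is copied into the packet as is (no byte order conversion
 * here), @set_size is the number of requests covered by that barrier.
 */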
2356 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2357 {
2358 	int ok;
2359 	struct p_barrier_ack p;
2360 
2361 	p.barrier  = barrier_nr;
2362 	p.set_size = cpu_to_be32(set_size);
2363 
2364 	if (mdev->state.conn < C_CONNECTED)
2365 		return false;
2366 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2367 			(struct p_header80 *)&p, sizeof(p));
2368 	return ok;
2369 }
2370 
2371 /**
2372  * _drbd_send_ack() - Sends an ack packet
2373  * @mdev:	DRBD device.
2374  * @cmd:	Packet command code.
2375  * @sector:	sector, needs to be in big endian byte order
2376  * @blksize:	size in byte, needs to be in big endian byte order
2377  * @block_id:	Id, big endian byte order
2378  */
2379 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2380 			  u64 sector,
2381 			  u32 blksize,
2382 			  u64 block_id)
2383 {
2384 	int ok;
2385 	struct p_block_ack p;
2386 
2387 	p.sector   = sector;
2388 	p.block_id = block_id;
2389 	p.blksize  = blksize;
2390 	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2391 
2392 	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2393 		return false;
2394 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2395 				(struct p_header80 *)&p, sizeof(p));
2396 	return ok;
2397 }
2398 
2399 /* dp->sector and dp->block_id already/still in network byte order,
2400  * data_size is payload size according to dp->head,
2401  * and may need to be corrected for digest size. */
2402 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2403 		     struct p_data *dp, int data_size)
2404 {
2405 	data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2406 		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2407 	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2408 			      dp->block_id);
2409 }
2410 
2411 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2412 		     struct p_block_req *rp)
2413 {
2414 	return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2415 }
2416 
2417 /**
2418  * drbd_send_ack() - Sends an ack packet
2419  * @mdev:	DRBD device.
2420  * @cmd:	Packet command code.
2421  * @e:		Epoch entry.
2422  */
2423 int drbd_send_ack(struct drbd_conf *mdev,
2424 	enum drbd_packets cmd, struct drbd_epoch_entry *e)
2425 {
2426 	return _drbd_send_ack(mdev, cmd,
2427 			      cpu_to_be64(e->sector),
2428 			      cpu_to_be32(e->size),
2429 			      e->block_id);
2430 }
2431 
2432 /* This function misuses the block_id field to signal if the blocks
2433  * are in sync or not. */
2434 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2435 		     sector_t sector, int blksize, u64 block_id)
2436 {
2437 	return _drbd_send_ack(mdev, cmd,
2438 			      cpu_to_be64(sector),
2439 			      cpu_to_be32(blksize),
2440 			      cpu_to_be64(block_id));
2441 }
2442 
2443 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2444 		       sector_t sector, int size, u64 block_id)
2445 {
2446 	int ok;
2447 	struct p_block_req p;
2448 
2449 	p.sector   = cpu_to_be64(sector);
2450 	p.block_id = block_id;
2451 	p.blksize  = cpu_to_be32(size);
2452 
2453 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2454 				(struct p_header80 *)&p, sizeof(p));
2455 	return ok;
2456 }
2457 
2458 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2459 			    sector_t sector, int size,
2460 			    void *digest, int digest_size,
2461 			    enum drbd_packets cmd)
2462 {
2463 	int ok;
2464 	struct p_block_req p;
2465 
2466 	p.sector   = cpu_to_be64(sector);
2467 	p.block_id = BE_DRBD_MAGIC + 0xbeef;
2468 	p.blksize  = cpu_to_be32(size);
2469 
2470 	p.head.magic   = BE_DRBD_MAGIC;
2471 	p.head.command = cpu_to_be16(cmd);
2472 	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2473 
2474 	mutex_lock(&mdev->data.mutex);
2475 
2476 	ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2477 	ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2478 
2479 	mutex_unlock(&mdev->data.mutex);
2480 
2481 	return ok;
2482 }
2483 
2484 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2485 {
2486 	int ok;
2487 	struct p_block_req p;
2488 
2489 	p.sector   = cpu_to_be64(sector);
2490 	p.block_id = BE_DRBD_MAGIC + 0xbabe;
2491 	p.blksize  = cpu_to_be32(size);
2492 
2493 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2494 			   (struct p_header80 *)&p, sizeof(p));
2495 	return ok;
2496 }
2497 
2498 /* called on sndtimeo
2499  * returns false if we should retry,
2500  * true if we think connection is dead
2501  */
2502 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2503 {
2504 	int drop_it;
2505 	/* long elapsed = (long)(jiffies - mdev->last_received); */
2506 
2507 	drop_it =   mdev->meta.socket == sock
2508 		|| !mdev->asender.task
2509 		|| get_t_state(&mdev->asender) != Running
2510 		|| mdev->state.conn < C_CONNECTED;
2511 
2512 	if (drop_it)
2513 		return true;
2514 
2515 	drop_it = !--mdev->ko_count;
2516 	if (!drop_it) {
2517 		dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2518 		       current->comm, current->pid, mdev->ko_count);
2519 		request_ping(mdev);
2520 	}
2521 
2522 	return drop_it; /* && (mdev->state == R_PRIMARY) */;
2523 }
2524 
2525 /* The idea of sendpage seems to be to put some kind of reference
2526  * to the page into the skb, and to hand it over to the NIC. In
2527  * this process get_page() gets called.
2528  *
2529  * As soon as the page was really sent over the network put_page()
2530  * gets called by some part of the network layer. [ NIC driver? ]
2531  *
2532  * [ get_page() / put_page() increment/decrement the count. If count
2533  *   reaches 0 the page will be freed. ]
2534  *
2535  * This works nicely with pages from FSs.
2536  * But this means that in protocol A we might signal IO completion too early!
2537  *
2538  * In order not to corrupt data during a resync we must make sure
2539  * that we do not reuse our own buffer pages (EEs) too early, therefore
2540  * we have the net_ee list.
2541  *
2542  * XFS seems to have problems, still, it submits pages with page_count == 0!
2543  * As a workaround, we disable sendpage on pages
2544  * with page_count == 0 or PageSlab.
2545  */
2546 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2547 		   int offset, size_t size, unsigned msg_flags)
2548 {
2549 	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2550 	kunmap(page);
2551 	if (sent == size)
2552 		mdev->send_cnt += size>>9;
2553 	return sent == size;
2554 }
2555 
2556 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2557 		    int offset, size_t size, unsigned msg_flags)
2558 {
2559 	mm_segment_t oldfs = get_fs();
2560 	int sent, ok;
2561 	int len = size;
2562 
2563 	/* e.g. XFS meta- & log-data is in slab pages, which have a
2564 	 * page_count of 0 and/or have PageSlab() set.
2565 	 * we cannot use send_page for those, as that does get_page();
2566 	 * put_page(); and would cause either a VM_BUG directly, or
2567 	 * __page_cache_release a page that would actually still be referenced
2568 	 * by someone, leading to some obscure delayed Oops somewhere else. */
2569 	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2570 		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2571 
2572 	msg_flags |= MSG_NOSIGNAL;
2573 	drbd_update_congested(mdev);
2574 	set_fs(KERNEL_DS);
2575 	do {
2576 		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2577 							offset, len,
2578 							msg_flags);
2579 		if (sent == -EAGAIN) {
2580 			if (we_should_drop_the_connection(mdev,
2581 							  mdev->data.socket))
2582 				break;
2583 			else
2584 				continue;
2585 		}
2586 		if (sent <= 0) {
2587 			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2588 			     __func__, (int)size, len, sent);
2589 			break;
2590 		}
2591 		len    -= sent;
2592 		offset += sent;
2593 	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2594 	set_fs(oldfs);
2595 	clear_bit(NET_CONGESTED, &mdev->flags);
2596 
2597 	ok = (len == 0);
2598 	if (likely(ok))
2599 		mdev->send_cnt += size>>9;
2600 	return ok;
2601 }
2602 
2603 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2604 {
2605 	struct bio_vec *bvec;
2606 	int i;
2607 	/* hint all but last page with MSG_MORE */
2608 	__bio_for_each_segment(bvec, bio, i, 0) {
2609 		if (!_drbd_no_send_page(mdev, bvec->bv_page,
2610 				     bvec->bv_offset, bvec->bv_len,
2611 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2612 			return 0;
2613 	}
2614 	return 1;
2615 }
2616 
2617 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2618 {
2619 	struct bio_vec *bvec;
2620 	int i;
2621 	/* hint all but last page with MSG_MORE */
2622 	__bio_for_each_segment(bvec, bio, i, 0) {
2623 		if (!_drbd_send_page(mdev, bvec->bv_page,
2624 				     bvec->bv_offset, bvec->bv_len,
2625 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2626 			return 0;
2627 	}
2628 	return 1;
2629 }
2630 
2631 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2632 {
2633 	struct page *page = e->pages;
2634 	unsigned len = e->size;
2635 	/* hint all but last page with MSG_MORE */
2636 	page_chain_for_each(page) {
2637 		unsigned l = min_t(unsigned, len, PAGE_SIZE);
2638 		if (!_drbd_send_page(mdev, page, 0, l,
2639 				page_chain_next(page) ? MSG_MORE : 0))
2640 			return 0;
2641 		len -= l;
2642 	}
2643 	return 1;
2644 }
2645 
2646 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2647 {
2648 	if (mdev->agreed_pro_version >= 95)
2649 		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2650 			(bi_rw & REQ_FUA ? DP_FUA : 0) |
2651 			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2652 			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2653 	else
2654 		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2655 }
2656 
2657 /* Used to send write requests
2658  * R_PRIMARY -> Peer	(P_DATA)
2659  */
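/* On the wire this becomes: a p_header80 or p_header95 (chosen by request
 * size), then an optional data digest of dgs bytes if an integrity transform
 * is configured, then the payload pages.  For protocol A, or whenever a
 * digest is in use, the pages are copied into the socket buffers via
 * _drbd_send_bio(); otherwise they are sent zero-copy via _drbd_send_zc_bio().
 */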
2660 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2661 {
2662 	int ok = 1;
2663 	struct p_data p;
2664 	unsigned int dp_flags = 0;
2665 	void *dgb;
2666 	int dgs;
2667 
2668 	if (!drbd_get_data_sock(mdev))
2669 		return 0;
2670 
2671 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2672 		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2673 
2674 	if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2675 		p.head.h80.magic   = BE_DRBD_MAGIC;
2676 		p.head.h80.command = cpu_to_be16(P_DATA);
2677 		p.head.h80.length  =
2678 			cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2679 	} else {
2680 		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2681 		p.head.h95.command = cpu_to_be16(P_DATA);
2682 		p.head.h95.length  =
2683 			cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2684 	}
2685 
2686 	p.sector   = cpu_to_be64(req->sector);
2687 	p.block_id = (unsigned long)req;
2688 	p.seq_num  = cpu_to_be32(req->seq_num =
2689 				 atomic_add_return(1, &mdev->packet_seq));
2690 
2691 	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2692 
2693 	if (mdev->state.conn >= C_SYNC_SOURCE &&
2694 	    mdev->state.conn <= C_PAUSED_SYNC_T)
2695 		dp_flags |= DP_MAY_SET_IN_SYNC;
2696 
2697 	p.dp_flags = cpu_to_be32(dp_flags);
2698 	set_bit(UNPLUG_REMOTE, &mdev->flags);
2699 	ok = (sizeof(p) ==
2700 		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2701 	if (ok && dgs) {
2702 		dgb = mdev->int_dig_out;
2703 		drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2704 		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2705 	}
2706 	if (ok) {
2707 		/* For protocol A, we have to memcpy the payload into
2708 		 * socket buffers, as we may complete right away
2709 		 * as soon as we handed it over to tcp, at which point the data
2710 		 * pages may become invalid.
2711 		 *
2712 		 * For data-integrity enabled, we copy it as well, so we can be
2713 		 * sure that even if the bio pages may still be modified, it
2714 		 * won't change the data on the wire, thus if the digest checks
2715 		 * out ok after sending on this side, but does not match on the
2716 		 * receiving side, we have certainly detected corruption elsewhere.
2717 		 */
2718 		if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2719 			ok = _drbd_send_bio(mdev, req->master_bio);
2720 		else
2721 			ok = _drbd_send_zc_bio(mdev, req->master_bio);
2722 
2723 		/* double check digest, sometimes buffers have been modified in flight. */
2724 		if (dgs > 0 && dgs <= 64) {
2725 			/* 64 byte, 512 bit, is the largest digest size
2726 			 * currently supported in kernel crypto. */
2727 			unsigned char digest[64];
2728 			drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2729 			if (memcmp(mdev->int_dig_out, digest, dgs)) {
2730 				dev_warn(DEV,
2731 					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2732 					(unsigned long long)req->sector, req->size);
2733 			}
2734 		} /* else if (dgs > 64) {
2735 		     ... Be noisy about digest too large ...
2736 		} */
2737 	}
2738 
2739 	drbd_put_data_sock(mdev);
2740 
2741 	return ok;
2742 }
2743 
2744 /* answer packet, used to send data back for read requests:
2745  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2746  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2747  */
2748 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2749 		    struct drbd_epoch_entry *e)
2750 {
2751 	int ok;
2752 	struct p_data p;
2753 	void *dgb;
2754 	int dgs;
2755 
2756 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2757 		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2758 
2759 	if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2760 		p.head.h80.magic   = BE_DRBD_MAGIC;
2761 		p.head.h80.command = cpu_to_be16(cmd);
2762 		p.head.h80.length  =
2763 			cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2764 	} else {
2765 		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2766 		p.head.h95.command = cpu_to_be16(cmd);
2767 		p.head.h95.length  =
2768 			cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2769 	}
2770 
2771 	p.sector   = cpu_to_be64(e->sector);
2772 	p.block_id = e->block_id;
2773 	/* p.seq_num  = 0;    No sequence numbers here.. */
2774 
2775 	/* Only called by our kernel thread.
2776 	 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2777 	 * in response to admin command or module unload.
2778 	 */
2779 	if (!drbd_get_data_sock(mdev))
2780 		return 0;
2781 
2782 	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2783 	if (ok && dgs) {
2784 		dgb = mdev->int_dig_out;
2785 		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2786 		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2787 	}
2788 	if (ok)
2789 		ok = _drbd_send_zc_ee(mdev, e);
2790 
2791 	drbd_put_data_sock(mdev);
2792 
2793 	return ok;
2794 }
2795 
2796 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2797 {
2798 	struct p_block_desc p;
2799 
2800 	p.sector  = cpu_to_be64(req->sector);
2801 	p.blksize = cpu_to_be32(req->size);
2802 
2803 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2804 }
2805 
2806 /*
2807   drbd_send distinguishes two cases:
2808 
2809   Packets sent via the data socket "sock"
2810   and packets sent via the meta data socket "msock"
2811 
2812 		    sock                      msock
2813   -----------------+-------------------------+------------------------------
2814   timeout           conf.timeout / 2          conf.timeout / 2
2815   timeout action    send a ping via msock     Abort communication
2816 					      and close all sockets
2817 */
2818 
2819 /*
2820  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2821  */
2822 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2823 	      void *buf, size_t size, unsigned msg_flags)
2824 {
2825 	struct kvec iov;
2826 	struct msghdr msg;
2827 	int rv, sent = 0;
2828 
2829 	if (!sock)
2830 		return -1000;
2831 
2832 	/* THINK  if (signal_pending) return ... ? */
2833 
2834 	iov.iov_base = buf;
2835 	iov.iov_len  = size;
2836 
2837 	msg.msg_name       = NULL;
2838 	msg.msg_namelen    = 0;
2839 	msg.msg_control    = NULL;
2840 	msg.msg_controllen = 0;
2841 	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2842 
2843 	if (sock == mdev->data.socket) {
2844 		mdev->ko_count = mdev->net_conf->ko_count;
2845 		drbd_update_congested(mdev);
2846 	}
2847 	do {
2848 		/* STRANGE
2849 		 * tcp_sendmsg does _not_ use its size parameter at all ?
2850 		 *
2851 		 * -EAGAIN on timeout, -EINTR on signal.
2852 		 */
2853 /* THINK
2854  * do we need to block DRBD_SIG if sock == &meta.socket ??
2855  * otherwise wake_asender() might interrupt some send_*Ack !
2856  */
2857 		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2858 		if (rv == -EAGAIN) {
2859 			if (we_should_drop_the_connection(mdev, sock))
2860 				break;
2861 			else
2862 				continue;
2863 		}
2864 		D_ASSERT(rv != 0);
2865 		if (rv == -EINTR) {
2866 			flush_signals(current);
2867 			rv = 0;
2868 		}
2869 		if (rv < 0)
2870 			break;
2871 		sent += rv;
2872 		iov.iov_base += rv;
2873 		iov.iov_len  -= rv;
2874 	} while (sent < size);
2875 
2876 	if (sock == mdev->data.socket)
2877 		clear_bit(NET_CONGESTED, &mdev->flags);
2878 
2879 	if (rv <= 0) {
2880 		if (rv != -EAGAIN) {
2881 			dev_err(DEV, "%s_sendmsg returned %d\n",
2882 			    sock == mdev->meta.socket ? "msock" : "sock",
2883 			    rv);
2884 			drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2885 		} else
2886 			drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2887 	}
2888 
2889 	return sent;
2890 }
2891 
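/*
 * drbd_open() - block device open method.
 *
 * Holds the req lock for a stable view of state.role and open_cnt:
 * opening for write is refused with -EROFS unless we are Primary, and
 * read-only opens on a Secondary are only allowed if the allow_oos
 * module parameter is set (otherwise -EMEDIUMTYPE).
 */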
2892 static int drbd_open(struct block_device *bdev, fmode_t mode)
2893 {
2894 	struct drbd_conf *mdev = bdev->bd_disk->private_data;
2895 	unsigned long flags;
2896 	int rv = 0;
2897 
2898 	mutex_lock(&drbd_main_mutex);
2899 	spin_lock_irqsave(&mdev->req_lock, flags);
2900 	/* to have a stable mdev->state.role
2901 	 * and no race with updating open_cnt */
2902 
2903 	if (mdev->state.role != R_PRIMARY) {
2904 		if (mode & FMODE_WRITE)
2905 			rv = -EROFS;
2906 		else if (!allow_oos)
2907 			rv = -EMEDIUMTYPE;
2908 	}
2909 
2910 	if (!rv)
2911 		mdev->open_cnt++;
2912 	spin_unlock_irqrestore(&mdev->req_lock, flags);
2913 	mutex_unlock(&drbd_main_mutex);
2914 
2915 	return rv;
2916 }
2917 
2918 static int drbd_release(struct gendisk *gd, fmode_t mode)
2919 {
2920 	struct drbd_conf *mdev = gd->private_data;
2921 	mutex_lock(&drbd_main_mutex);
2922 	mdev->open_cnt--;
2923 	mutex_unlock(&drbd_main_mutex);
2924 	return 0;
2925 }
2926 
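/*
 * drbd_set_defaults() - reset syncer configuration and device state.
 *
 * The resulting state is R_SECONDARY / C_STANDALONE / D_DISKLESS with an
 * unknown peer, i.e. that of a freshly created, unconfigured device.
 * Also used from drbd_mdev_cleanup() when a device is torn down.
 */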
2927 static void drbd_set_defaults(struct drbd_conf *mdev)
2928 {
2929 	/* This way we get a compile error when sync_conf grows
2930 	   and we forget to initialize it here */
2931 	mdev->sync_conf = (struct syncer_conf) {
2932 		/* .rate = */		DRBD_RATE_DEF,
2933 		/* .after = */		DRBD_AFTER_DEF,
2934 		/* .al_extents = */	DRBD_AL_EXTENTS_DEF,
2935 		/* .verify_alg = */	{}, 0,
2936 		/* .cpu_mask = */	{}, 0,
2937 		/* .csums_alg = */	{}, 0,
2938 		/* .use_rle = */	0,
2939 		/* .on_no_data = */	DRBD_ON_NO_DATA_DEF,
2940 		/* .c_plan_ahead = */	DRBD_C_PLAN_AHEAD_DEF,
2941 		/* .c_delay_target = */	DRBD_C_DELAY_TARGET_DEF,
2942 		/* .c_fill_target = */	DRBD_C_FILL_TARGET_DEF,
2943 		/* .c_max_rate = */	DRBD_C_MAX_RATE_DEF,
2944 		/* .c_min_rate = */	DRBD_C_MIN_RATE_DEF
2945 	};
2946 
2947 	/* Have to use that way, because the layout differs between
2948 	   big endian and little endian */
2949 	mdev->state = (union drbd_state) {
2950 		{ .role = R_SECONDARY,
2951 		  .peer = R_UNKNOWN,
2952 		  .conn = C_STANDALONE,
2953 		  .disk = D_DISKLESS,
2954 		  .pdsk = D_UNKNOWN,
2955 		  .susp = 0,
2956 		  .susp_nod = 0,
2957 		  .susp_fen = 0
2958 		} };
2959 }
2960 
2961 void drbd_init_set_defaults(struct drbd_conf *mdev)
2962 {
2963 	/* the memset(,0,) did most of this.
2964 	 * note: only assignments, no allocation in here */
2965 
2966 	drbd_set_defaults(mdev);
2967 
2968 	atomic_set(&mdev->ap_bio_cnt, 0);
2969 	atomic_set(&mdev->ap_pending_cnt, 0);
2970 	atomic_set(&mdev->rs_pending_cnt, 0);
2971 	atomic_set(&mdev->unacked_cnt, 0);
2972 	atomic_set(&mdev->local_cnt, 0);
2973 	atomic_set(&mdev->net_cnt, 0);
2974 	atomic_set(&mdev->packet_seq, 0);
2975 	atomic_set(&mdev->pp_in_use, 0);
2976 	atomic_set(&mdev->pp_in_use_by_net, 0);
2977 	atomic_set(&mdev->rs_sect_in, 0);
2978 	atomic_set(&mdev->rs_sect_ev, 0);
2979 	atomic_set(&mdev->ap_in_flight, 0);
2980 
2981 	mutex_init(&mdev->md_io_mutex);
2982 	mutex_init(&mdev->data.mutex);
2983 	mutex_init(&mdev->meta.mutex);
2984 	sema_init(&mdev->data.work.s, 0);
2985 	sema_init(&mdev->meta.work.s, 0);
2986 	mutex_init(&mdev->state_mutex);
2987 
2988 	spin_lock_init(&mdev->data.work.q_lock);
2989 	spin_lock_init(&mdev->meta.work.q_lock);
2990 
2991 	spin_lock_init(&mdev->al_lock);
2992 	spin_lock_init(&mdev->req_lock);
2993 	spin_lock_init(&mdev->peer_seq_lock);
2994 	spin_lock_init(&mdev->epoch_lock);
2995 
2996 	INIT_LIST_HEAD(&mdev->active_ee);
2997 	INIT_LIST_HEAD(&mdev->sync_ee);
2998 	INIT_LIST_HEAD(&mdev->done_ee);
2999 	INIT_LIST_HEAD(&mdev->read_ee);
3000 	INIT_LIST_HEAD(&mdev->net_ee);
3001 	INIT_LIST_HEAD(&mdev->resync_reads);
3002 	INIT_LIST_HEAD(&mdev->data.work.q);
3003 	INIT_LIST_HEAD(&mdev->meta.work.q);
3004 	INIT_LIST_HEAD(&mdev->resync_work.list);
3005 	INIT_LIST_HEAD(&mdev->unplug_work.list);
3006 	INIT_LIST_HEAD(&mdev->go_diskless.list);
3007 	INIT_LIST_HEAD(&mdev->md_sync_work.list);
3008 	INIT_LIST_HEAD(&mdev->start_resync_work.list);
3009 	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3010 
3011 	mdev->resync_work.cb  = w_resync_timer;
3012 	mdev->unplug_work.cb  = w_send_write_hint;
3013 	mdev->go_diskless.cb  = w_go_diskless;
3014 	mdev->md_sync_work.cb = w_md_sync;
3015 	mdev->bm_io_work.w.cb = w_bitmap_io;
3016 	mdev->start_resync_work.cb = w_start_resync;
3017 	init_timer(&mdev->resync_timer);
3018 	init_timer(&mdev->md_sync_timer);
3019 	init_timer(&mdev->start_resync_timer);
3020 	init_timer(&mdev->request_timer);
3021 	mdev->resync_timer.function = resync_timer_fn;
3022 	mdev->resync_timer.data = (unsigned long) mdev;
3023 	mdev->md_sync_timer.function = md_sync_timer_fn;
3024 	mdev->md_sync_timer.data = (unsigned long) mdev;
3025 	mdev->start_resync_timer.function = start_resync_timer_fn;
3026 	mdev->start_resync_timer.data = (unsigned long) mdev;
3027 	mdev->request_timer.function = request_timer_fn;
3028 	mdev->request_timer.data = (unsigned long) mdev;
3029 
3030 	init_waitqueue_head(&mdev->misc_wait);
3031 	init_waitqueue_head(&mdev->state_wait);
3032 	init_waitqueue_head(&mdev->net_cnt_wait);
3033 	init_waitqueue_head(&mdev->ee_wait);
3034 	init_waitqueue_head(&mdev->al_wait);
3035 	init_waitqueue_head(&mdev->seq_wait);
3036 
3037 	drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3038 	drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3039 	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3040 
3041 	mdev->agreed_pro_version = PRO_VERSION_MAX;
3042 	mdev->write_ordering = WO_bdev_flush;
3043 	mdev->resync_wenr = LC_FREE;
3044 }
3045 
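/*
 * drbd_mdev_cleanup() - return a device to its unconfigured state.
 *
 * Expects the threads to be stopped already (asserts that the receiver is
 * None), resets the transfer counters, resizes the bitmap to zero, frees
 * remaining resources and finally re-applies drbd_set_defaults().
 */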
3046 void drbd_mdev_cleanup(struct drbd_conf *mdev)
3047 {
3048 	int i;
3049 	if (mdev->receiver.t_state != None)
3050 		dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3051 				mdev->receiver.t_state);
3052 
3053 	/* no need to lock it, I'm the only thread alive */
3054 	if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
3055 		dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3056 	mdev->al_writ_cnt  =
3057 	mdev->bm_writ_cnt  =
3058 	mdev->read_cnt     =
3059 	mdev->recv_cnt     =
3060 	mdev->send_cnt     =
3061 	mdev->writ_cnt     =
3062 	mdev->p_size       =
3063 	mdev->rs_start     =
3064 	mdev->rs_total     =
3065 	mdev->rs_failed    = 0;
3066 	mdev->rs_last_events = 0;
3067 	mdev->rs_last_sect_ev = 0;
3068 	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3069 		mdev->rs_mark_left[i] = 0;
3070 		mdev->rs_mark_time[i] = 0;
3071 	}
3072 	D_ASSERT(mdev->net_conf == NULL);
3073 
3074 	drbd_set_my_capacity(mdev, 0);
3075 	if (mdev->bitmap) {
3076 		/* maybe never allocated. */
3077 		drbd_bm_resize(mdev, 0, 1);
3078 		drbd_bm_cleanup(mdev);
3079 	}
3080 
3081 	drbd_free_resources(mdev);
3082 	clear_bit(AL_SUSPENDED, &mdev->flags);
3083 
3084 	/*
3085 	 * currently we call drbd_init_ee only on module load, so
3086 	 * we may call drbd_release_ee only on module unload!
3087 	 */
3088 	D_ASSERT(list_empty(&mdev->active_ee));
3089 	D_ASSERT(list_empty(&mdev->sync_ee));
3090 	D_ASSERT(list_empty(&mdev->done_ee));
3091 	D_ASSERT(list_empty(&mdev->read_ee));
3092 	D_ASSERT(list_empty(&mdev->net_ee));
3093 	D_ASSERT(list_empty(&mdev->resync_reads));
3094 	D_ASSERT(list_empty(&mdev->data.work.q));
3095 	D_ASSERT(list_empty(&mdev->meta.work.q));
3096 	D_ASSERT(list_empty(&mdev->resync_work.list));
3097 	D_ASSERT(list_empty(&mdev->unplug_work.list));
3098 	D_ASSERT(list_empty(&mdev->go_diskless.list));
3099 
3100 	drbd_set_defaults(mdev);
3101 }
3102 
3103 
3104 static void drbd_destroy_mempools(void)
3105 {
3106 	struct page *page;
3107 
3108 	while (drbd_pp_pool) {
3109 		page = drbd_pp_pool;
3110 		drbd_pp_pool = (struct page *)page_private(page);
3111 		__free_page(page);
3112 		drbd_pp_vacant--;
3113 	}
3114 
3115 	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3116 
3117 	if (drbd_ee_mempool)
3118 		mempool_destroy(drbd_ee_mempool);
3119 	if (drbd_request_mempool)
3120 		mempool_destroy(drbd_request_mempool);
3121 	if (drbd_ee_cache)
3122 		kmem_cache_destroy(drbd_ee_cache);
3123 	if (drbd_request_cache)
3124 		kmem_cache_destroy(drbd_request_cache);
3125 	if (drbd_bm_ext_cache)
3126 		kmem_cache_destroy(drbd_bm_ext_cache);
3127 	if (drbd_al_ext_cache)
3128 		kmem_cache_destroy(drbd_al_ext_cache);
3129 
3130 	drbd_ee_mempool      = NULL;
3131 	drbd_request_mempool = NULL;
3132 	drbd_ee_cache        = NULL;
3133 	drbd_request_cache   = NULL;
3134 	drbd_bm_ext_cache    = NULL;
3135 	drbd_al_ext_cache    = NULL;
3136 
3137 	return;
3138 }
3139 
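/*
 * drbd_create_mempools() - allocate the slab caches, mempools and page pool.
 *
 * Each pool is sized to (DRBD_MAX_BIO_SIZE / PAGE_SIZE) * minor_count
 * elements, i.e. enough reserved memory for one maximum sized request per
 * configured minor.  On any failure everything allocated so far is torn
 * down again and -ENOMEM is returned.
 */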
3140 static int drbd_create_mempools(void)
3141 {
3142 	struct page *page;
3143 	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3144 	int i;
3145 
3146 	/* prepare our caches and mempools */
3147 	drbd_request_mempool = NULL;
3148 	drbd_ee_cache        = NULL;
3149 	drbd_request_cache   = NULL;
3150 	drbd_bm_ext_cache    = NULL;
3151 	drbd_al_ext_cache    = NULL;
3152 	drbd_pp_pool         = NULL;
3153 
3154 	/* caches */
3155 	drbd_request_cache = kmem_cache_create(
3156 		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3157 	if (drbd_request_cache == NULL)
3158 		goto Enomem;
3159 
3160 	drbd_ee_cache = kmem_cache_create(
3161 		"drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3162 	if (drbd_ee_cache == NULL)
3163 		goto Enomem;
3164 
3165 	drbd_bm_ext_cache = kmem_cache_create(
3166 		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3167 	if (drbd_bm_ext_cache == NULL)
3168 		goto Enomem;
3169 
3170 	drbd_al_ext_cache = kmem_cache_create(
3171 		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3172 	if (drbd_al_ext_cache == NULL)
3173 		goto Enomem;
3174 
3175 	/* mempools */
3176 	drbd_request_mempool = mempool_create(number,
3177 		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3178 	if (drbd_request_mempool == NULL)
3179 		goto Enomem;
3180 
3181 	drbd_ee_mempool = mempool_create(number,
3182 		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3183 	if (drbd_ee_mempool == NULL)
3184 		goto Enomem;
3185 
3186 	/* drbd's page pool */
3187 	spin_lock_init(&drbd_pp_lock);
3188 
3189 	for (i = 0; i < number; i++) {
3190 		page = alloc_page(GFP_HIGHUSER);
3191 		if (!page)
3192 			goto Enomem;
3193 		set_page_private(page, (unsigned long)drbd_pp_pool);
3194 		drbd_pp_pool = page;
3195 	}
3196 	drbd_pp_vacant = number;
3197 
3198 	return 0;
3199 
3200 Enomem:
3201 	drbd_destroy_mempools(); /* in case we allocated some */
3202 	return -ENOMEM;
3203 }
3204 
3205 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3206 	void *unused)
3207 {
3208 	/* just so we have it.  you never know what interesting things we
3209 	 * might want to do here some day...
3210 	 */
3211 
3212 	return NOTIFY_DONE;
3213 }
3214 
3215 static struct notifier_block drbd_notifier = {
3216 	.notifier_call = drbd_notify_sys,
3217 };
3218 
3219 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3220 {
3221 	int rr;
3222 
3223 	rr = drbd_release_ee(mdev, &mdev->active_ee);
3224 	if (rr)
3225 		dev_err(DEV, "%d EEs in active list found!\n", rr);
3226 
3227 	rr = drbd_release_ee(mdev, &mdev->sync_ee);
3228 	if (rr)
3229 		dev_err(DEV, "%d EEs in sync list found!\n", rr);
3230 
3231 	rr = drbd_release_ee(mdev, &mdev->read_ee);
3232 	if (rr)
3233 		dev_err(DEV, "%d EEs in read list found!\n", rr);
3234 
3235 	rr = drbd_release_ee(mdev, &mdev->done_ee);
3236 	if (rr)
3237 		dev_err(DEV, "%d EEs in done list found!\n", rr);
3238 
3239 	rr = drbd_release_ee(mdev, &mdev->net_ee);
3240 	if (rr)
3241 		dev_err(DEV, "%d EEs in net list found!\n", rr);
3242 }
3243 
3244 /* caution. no locking.
3245  * currently only used from module cleanup code. */
3246 static void drbd_delete_device(unsigned int minor)
3247 {
3248 	struct drbd_conf *mdev = minor_to_mdev(minor);
3249 
3250 	if (!mdev)
3251 		return;
3252 
3253 	/* paranoia asserts */
3254 	if (mdev->open_cnt != 0)
3255 		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3256 				__FILE__ , __LINE__);
3257 
3258 	ERR_IF (!list_empty(&mdev->data.work.q)) {
3259 		struct list_head *lp;
3260 		list_for_each(lp, &mdev->data.work.q) {
3261 			dev_err(DEV, "lp = %p\n", lp);
3262 		}
3263 	};
3264 	/* end paranoia asserts */
3265 
3266 	del_gendisk(mdev->vdisk);
3267 
3268 	/* cleanup stuff that may have been allocated during
3269 	 * device (re-)configuration or state changes */
3270 
3271 	if (mdev->this_bdev)
3272 		bdput(mdev->this_bdev);
3273 
3274 	drbd_free_resources(mdev);
3275 
3276 	drbd_release_ee_lists(mdev);
3277 
3278 	/* should be freed on disconnect? */
3279 	kfree(mdev->ee_hash);
3280 	/*
3281 	mdev->ee_hash_s = 0;
3282 	mdev->ee_hash = NULL;
3283 	*/
3284 
3285 	lc_destroy(mdev->act_log);
3286 	lc_destroy(mdev->resync);
3287 
3288 	kfree(mdev->p_uuid);
3289 	/* mdev->p_uuid = NULL; */
3290 
3291 	kfree(mdev->int_dig_out);
3292 	kfree(mdev->int_dig_in);
3293 	kfree(mdev->int_dig_vv);
3294 
3295 	/* cleanup the rest that has been
3296 	 * allocated from drbd_new_device
3297 	 * and actually free the mdev itself */
3298 	drbd_free_mdev(mdev);
3299 }
3300 
3301 static void drbd_cleanup(void)
3302 {
3303 	unsigned int i;
3304 
3305 	unregister_reboot_notifier(&drbd_notifier);
3306 
3307 	/* first remove proc,
3308 	 * drbdsetup uses its presence to detect
3309 	 * whether DRBD is loaded.
3310 	 * If we got stuck in proc removal
3311 	 * after netlink was already deregistered,
3312 	 * some drbdsetup commands could wait forever
3313 	 * for an answer.
3314 	 */
3315 	if (drbd_proc)
3316 		remove_proc_entry("drbd", NULL);
3317 
3318 	drbd_nl_cleanup();
3319 
3320 	if (minor_table) {
3321 		i = minor_count;
3322 		while (i--)
3323 			drbd_delete_device(i);
3324 		drbd_destroy_mempools();
3325 	}
3326 
3327 	kfree(minor_table);
3328 
3329 	unregister_blkdev(DRBD_MAJOR, "drbd");
3330 
3331 	printk(KERN_INFO "drbd: module cleanup done.\n");
3332 }
3333 
3334 /**
3335  * drbd_congested() - Callback for pdflush
3336  * @congested_data:	User data
3337  * @bdi_bits:		Bits pdflush is currently interested in
3338  *
3339  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3340  */
3341 static int drbd_congested(void *congested_data, int bdi_bits)
3342 {
3343 	struct drbd_conf *mdev = congested_data;
3344 	struct request_queue *q;
3345 	char reason = '-';
3346 	int r = 0;
3347 
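	/* The single-character congestion_reason records why we claimed to be
	 * congested (only informational; presumably consumed by /proc output):
	 *   '-' not congested, 'd' DRBD froze application IO,
	 *   'b' local backing device congested, 'n' network send path
	 *   congested, 'a' both backing device and network congested. */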
3348 	if (!may_inc_ap_bio(mdev)) {
3349 		/* DRBD has frozen IO */
3350 		r = bdi_bits;
3351 		reason = 'd';
3352 		goto out;
3353 	}
3354 
3355 	if (get_ldev(mdev)) {
3356 		q = bdev_get_queue(mdev->ldev->backing_bdev);
3357 		r = bdi_congested(&q->backing_dev_info, bdi_bits);
3358 		put_ldev(mdev);
3359 		if (r)
3360 			reason = 'b';
3361 	}
3362 
3363 	if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3364 		r |= (1 << BDI_async_congested);
3365 		reason = reason == 'b' ? 'a' : 'n';
3366 	}
3367 
3368 out:
3369 	mdev->congestion_reason = reason;
3370 	return r;
3371 }
3372 
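/**
 * drbd_new_device() - Allocate and initialize the in-memory state of one minor
 * @minor:	device minor number
 *
 * Allocates the drbd_conf structure, the request queue, the gendisk, the
 * meta data IO page, the bitmap, the transfer log, the application read hash
 * and the initial epoch, and wires them together.  Returns the new device,
 * or NULL if any allocation fails (everything allocated so far is released
 * again on the error path).
 */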
3373 struct drbd_conf *drbd_new_device(unsigned int minor)
3374 {
3375 	struct drbd_conf *mdev;
3376 	struct gendisk *disk;
3377 	struct request_queue *q;
3378 
3379 	/* GFP_KERNEL, we are outside of all write-out paths */
3380 	mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3381 	if (!mdev)
3382 		return NULL;
3383 	if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3384 		goto out_no_cpumask;
3385 
3386 	mdev->minor = minor;
3387 
3388 	drbd_init_set_defaults(mdev);
3389 
3390 	q = blk_alloc_queue(GFP_KERNEL);
3391 	if (!q)
3392 		goto out_no_q;
3393 	mdev->rq_queue = q;
3394 	q->queuedata   = mdev;
3395 
3396 	disk = alloc_disk(1);
3397 	if (!disk)
3398 		goto out_no_disk;
3399 	mdev->vdisk = disk;
3400 
3401 	set_disk_ro(disk, true);
3402 
3403 	disk->queue = q;
3404 	disk->major = DRBD_MAJOR;
3405 	disk->first_minor = minor;
3406 	disk->fops = &drbd_ops;
3407 	sprintf(disk->disk_name, "drbd%d", minor);
3408 	disk->private_data = mdev;
3409 
3410 	mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3411 	/* we have no partitions. we contain only ourselves. */
3412 	mdev->this_bdev->bd_contains = mdev->this_bdev;
3413 
3414 	q->backing_dev_info.congested_fn = drbd_congested;
3415 	q->backing_dev_info.congested_data = mdev;
3416 
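	/* Request queue setup: drbd_make_request() sees every bio directly
	 * (no IO scheduler in between), bios are capped at DRBD_MAX_BIO_SIZE,
	 * no bounce buffering is needed, and drbd_merge_bvec() (defined
	 * elsewhere) restricts how bio_vecs may be merged. */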
3417 	blk_queue_make_request(q, drbd_make_request);
3418 	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3419 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3420 	blk_queue_merge_bvec(q, drbd_merge_bvec);
3421 	q->queue_lock = &mdev->req_lock;
3422 
3423 	mdev->md_io_page = alloc_page(GFP_KERNEL);
3424 	if (!mdev->md_io_page)
3425 		goto out_no_io_page;
3426 
3427 	if (drbd_bm_init(mdev))
3428 		goto out_no_bitmap;
3429 	/* no need to lock access, we are still initializing this minor device. */
3430 	if (!tl_init(mdev))
3431 		goto out_no_tl;
3432 
3433 	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3434 	if (!mdev->app_reads_hash)
3435 		goto out_no_app_reads;
3436 
3437 	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3438 	if (!mdev->current_epoch)
3439 		goto out_no_epoch;
3440 
3441 	INIT_LIST_HEAD(&mdev->current_epoch->list);
3442 	mdev->epochs = 1;
3443 
3444 	return mdev;
3445 
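/* Error unwinding: each label below releases what was allocated before the
 * corresponding failure point, in reverse order of allocation. */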
3446 /* out_whatever_else:
3447 	kfree(mdev->current_epoch); */
3448 out_no_epoch:
3449 	kfree(mdev->app_reads_hash);
3450 out_no_app_reads:
3451 	tl_cleanup(mdev);
3452 out_no_tl:
3453 	drbd_bm_cleanup(mdev);
3454 out_no_bitmap:
3455 	__free_page(mdev->md_io_page);
3456 out_no_io_page:
3457 	put_disk(disk);
3458 out_no_disk:
3459 	blk_cleanup_queue(q);
3460 out_no_q:
3461 	free_cpumask_var(mdev->cpu_mask);
3462 out_no_cpumask:
3463 	kfree(mdev);
3464 	return NULL;
3465 }
3466 
3467 /* counterpart of drbd_new_device.
3468  * last part of drbd_delete_device. */
3469 void drbd_free_mdev(struct drbd_conf *mdev)
3470 {
3471 	kfree(mdev->current_epoch);
3472 	kfree(mdev->app_reads_hash);
3473 	tl_cleanup(mdev);
3474 	if (mdev->bitmap) /* should no longer be there. */
3475 		drbd_bm_cleanup(mdev);
3476 	__free_page(mdev->md_io_page);
3477 	put_disk(mdev->vdisk);
3478 	blk_cleanup_queue(mdev->rq_queue);
3479 	free_cpumask_var(mdev->cpu_mask);
3480 	drbd_free_tl_hash(mdev);
3481 	kfree(mdev);
3482 }
3483 
3484 
3485 int __init drbd_init(void)
3486 {
3487 	int err;
3488 
3489 	if (sizeof(struct p_handshake) != 80) {
3490 		printk(KERN_ERR
3491 		       "drbd: never change the size or layout "
3492 		       "of the HandShake packet.\n");
3493 		return -EINVAL;
3494 	}
3495 
3496 	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3497 		printk(KERN_ERR
3498 			"drbd: invalid minor_count (%d)\n", minor_count);
3499 #ifdef MODULE
3500 		return -EINVAL;
3501 #else
3502 		minor_count = 8;
3503 #endif
3504 	}
3505 
3506 	err = drbd_nl_init();
3507 	if (err)
3508 		return err;
3509 
3510 	err = register_blkdev(DRBD_MAJOR, "drbd");
3511 	if (err) {
3512 		printk(KERN_ERR
3513 		       "drbd: unable to register block device major %d\n",
3514 		       DRBD_MAJOR);
3515 		return err;
3516 	}
3517 
3518 	register_reboot_notifier(&drbd_notifier);
3519 
3520 	/*
3521 	 * allocate all necessary structs
3522 	 */
3523 	err = -ENOMEM;
3524 
3525 	init_waitqueue_head(&drbd_pp_wait);
3526 
3527 	drbd_proc = NULL; /* play safe for drbd_cleanup */
3528 	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3529 				GFP_KERNEL);
3530 	if (!minor_table)
3531 		goto Enomem;
3532 
3533 	err = drbd_create_mempools();
3534 	if (err)
3535 		goto Enomem;
3536 
3537 	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3538 	if (!drbd_proc)	{
3539 		printk(KERN_ERR "drbd: unable to register proc file\n");
3540 		goto Enomem;
3541 	}
3542 
3543 	rwlock_init(&global_state_lock);
3544 
3545 	printk(KERN_INFO "drbd: initialized. "
3546 	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3547 	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3548 	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3549 	printk(KERN_INFO "drbd: registered as block device major %d\n",
3550 		DRBD_MAJOR);
3551 	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3552 
3553 	return 0; /* Success! */
3554 
3555 Enomem:
3556 	drbd_cleanup();
3557 	if (err == -ENOMEM)
3558 		/* currently always the case */
3559 		printk(KERN_ERR "drbd: ran out of memory\n");
3560 	else
3561 		printk(KERN_ERR "drbd: initialization failure\n");
3562 	return err;
3563 }
3564 
3565 void drbd_free_bc(struct drbd_backing_dev *ldev)
3566 {
3567 	if (ldev == NULL)
3568 		return;
3569 
3570 	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3571 	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3572 
3573 	kfree(ldev);
3574 }
3575 
3576 void drbd_free_sock(struct drbd_conf *mdev)
3577 {
3578 	if (mdev->data.socket) {
3579 		mutex_lock(&mdev->data.mutex);
3580 		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3581 		sock_release(mdev->data.socket);
3582 		mdev->data.socket = NULL;
3583 		mutex_unlock(&mdev->data.mutex);
3584 	}
3585 	if (mdev->meta.socket) {
3586 		mutex_lock(&mdev->meta.mutex);
3587 		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3588 		sock_release(mdev->meta.socket);
3589 		mdev->meta.socket = NULL;
3590 		mutex_unlock(&mdev->meta.mutex);
3591 	}
3592 }
3593 
3594 
3595 void drbd_free_resources(struct drbd_conf *mdev)
3596 {
3597 	crypto_free_hash(mdev->csums_tfm);
3598 	mdev->csums_tfm = NULL;
3599 	crypto_free_hash(mdev->verify_tfm);
3600 	mdev->verify_tfm = NULL;
3601 	crypto_free_hash(mdev->cram_hmac_tfm);
3602 	mdev->cram_hmac_tfm = NULL;
3603 	crypto_free_hash(mdev->integrity_w_tfm);
3604 	mdev->integrity_w_tfm = NULL;
3605 	crypto_free_hash(mdev->integrity_r_tfm);
3606 	mdev->integrity_r_tfm = NULL;
3607 
3608 	drbd_free_sock(mdev);
3609 
3610 	__no_warn(local,
3611 		  drbd_free_bc(mdev->ldev);
3612 		  mdev->ldev = NULL;);
3613 }
3614 
3615 /* meta data management */
3616 
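/* On-disk layout of the DRBD meta data super block.  The struct is __packed,
 * all multi-byte fields are stored big-endian (cpu_to_beXX() on write,
 * beXX_to_cpu() on read), and drbd_md_sync() below zeroes a full 512-byte
 * buffer around it before writing it out in one piece. */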
3617 struct meta_data_on_disk {
3618 	u64 la_size;           /* last agreed size. */
3619 	u64 uuid[UI_SIZE];   /* UUIDs. */
3620 	u64 device_uuid;
3621 	u64 reserved_u64_1;
3622 	u32 flags;             /* MDF */
3623 	u32 magic;
3624 	u32 md_size_sect;
3625 	u32 al_offset;         /* offset to this block */
3626 	u32 al_nr_extents;     /* important for restoring the AL */
3627 	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3628 	u32 bm_offset;         /* offset to the bitmap, from here */
3629 	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3630 	u32 reserved_u32[4];
3631 
3632 } __packed;
3633 
3634 /**
3635  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3636  * @mdev:	DRBD device.
3637  */
3638 void drbd_md_sync(struct drbd_conf *mdev)
3639 {
3640 	struct meta_data_on_disk *buffer;
3641 	sector_t sector;
3642 	int i;
3643 
3644 	del_timer(&mdev->md_sync_timer);
3645 	/* timer may be rearmed by drbd_md_mark_dirty() now. */
3646 	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3647 		return;
3648 
3649 	/* We use D_FAILED here, not D_ATTACHING, because we try to write
3650 	 * the metadata even if we detach due to a disk failure! */
3651 	if (!get_ldev_if_state(mdev, D_FAILED))
3652 		return;
3653 
3654 	mutex_lock(&mdev->md_io_mutex);
3655 	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3656 	memset(buffer, 0, 512);
3657 
3658 	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3659 	for (i = UI_CURRENT; i < UI_SIZE; i++)
3660 		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3661 	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3662 	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3663 
3664 	buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3665 	buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3666 	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3667 	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3668 	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3669 
3670 	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3671 
3672 	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3673 	sector = mdev->ldev->md.md_offset;
3674 
3675 	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3676 		/* this was a try anyway ... */
3677 		dev_err(DEV, "meta data update failed!\n");
3678 		drbd_chk_io_error(mdev, 1, true);
3679 	}
3680 
3681 	/* Update mdev->ldev->md.la_size_sect,
3682 	 * since we updated it on metadata. */
3683 	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3684 
3685 	mutex_unlock(&mdev->md_io_mutex);
3686 	put_ldev(mdev);
3687 }
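/*
 * Typical update pattern of callers (an illustrative sketch, not a verbatim
 * caller; new_flags is a placeholder):
 *
 *	mdev->ldev->md.flags = new_flags;
 *	drbd_md_mark_dirty(mdev);	// sets MD_DIRTY and arms md_sync_timer
 *	drbd_md_sync(mdev);		// optional: flush immediately; otherwise
 *					// w_md_sync() writes it out later
 */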
3688 
3689 /**
3690  * drbd_md_read() - Reads in the meta data super block
3691  * @mdev:	DRBD device.
3692  * @bdev:	Device from which the meta data should be read in.
3693  *
3694  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3695  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3696  */
3697 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3698 {
3699 	struct meta_data_on_disk *buffer;
3700 	int i, rv = NO_ERROR;
3701 
3702 	if (!get_ldev_if_state(mdev, D_ATTACHING))
3703 		return ERR_IO_MD_DISK;
3704 
3705 	mutex_lock(&mdev->md_io_mutex);
3706 	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3707 
3708 	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3709 		/* NOTE: can't do normal error processing here as this is
3710 		   called BEFORE disk is attached */
3711 		dev_err(DEV, "Error while reading metadata.\n");
3712 		rv = ERR_IO_MD_DISK;
3713 		goto err;
3714 	}
3715 
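	/* Cross-check the values read from disk against the layout computed
	 * for this backing device (bdev->md fields); any mismatch means the
	 * meta data does not match this configuration and is rejected. */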
3716 	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3717 		dev_err(DEV, "Error while reading metadata, magic not found.\n");
3718 		rv = ERR_MD_INVALID;
3719 		goto err;
3720 	}
3721 	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3722 		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3723 		    be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3724 		rv = ERR_MD_INVALID;
3725 		goto err;
3726 	}
3727 	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3728 		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3729 		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3730 		rv = ERR_MD_INVALID;
3731 		goto err;
3732 	}
3733 	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3734 		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3735 		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3736 		rv = ERR_MD_INVALID;
3737 		goto err;
3738 	}
3739 
3740 	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3741 		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3742 		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3743 		rv = ERR_MD_INVALID;
3744 		goto err;
3745 	}
3746 
3747 	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3748 	for (i = UI_CURRENT; i < UI_SIZE; i++)
3749 		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3750 	bdev->md.flags = be32_to_cpu(buffer->flags);
3751 	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3752 	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3753 
3754 	if (mdev->sync_conf.al_extents < 7)
3755 		mdev->sync_conf.al_extents = 127;
3756 
3757  err:
3758 	mutex_unlock(&mdev->md_io_mutex);
3759 	put_ldev(mdev);
3760 
3761 	return rv;
3762 }
3763 
3764 /**
3765  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3766  * @mdev:	DRBD device.
3767  *
3768  * Call this function if you change anything that should be written to
3769  * the meta-data super block. This function sets MD_DIRTY, and starts a
3770  * timer that ensures drbd_md_sync() gets called within five seconds.
3771  */
3772 #ifdef DEBUG
3773 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3774 {
3775 	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3776 		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3777 		mdev->last_md_mark_dirty.line = line;
3778 		mdev->last_md_mark_dirty.func = func;
3779 	}
3780 }
3781 #else
3782 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3783 {
3784 	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3785 		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3786 }
3787 #endif
3788 
3789 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3790 {
3791 	int i;
3792 
3793 	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3794 		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3795 }
3796 
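/* For UI_CURRENT the least significant bit encodes the local role: it is set
 * while we are Primary and cleared otherwise.  The same value is also
 * mirrored into the exposed-data UUID via drbd_set_ed_uuid() (the "exposed
 * data" expansion of "ed" is an assumption). */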
3797 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3798 {
3799 	if (idx == UI_CURRENT) {
3800 		if (mdev->state.role == R_PRIMARY)
3801 			val |= 1;
3802 		else
3803 			val &= ~((u64)1);
3804 
3805 		drbd_set_ed_uuid(mdev, val);
3806 	}
3807 
3808 	mdev->ldev->md.uuid[idx] = val;
3809 	drbd_md_mark_dirty(mdev);
3810 }
3811 
3812 
3813 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3814 {
3815 	if (mdev->ldev->md.uuid[idx]) {
3816 		drbd_uuid_move_history(mdev);
3817 		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3818 	}
3819 	_drbd_uuid_set(mdev, idx, val);
3820 }
3821 
3822 /**
3823  * drbd_uuid_new_current() - Creates a new current UUID
3824  * @mdev:	DRBD device.
3825  *
3826  * Creates a new current UUID, and rotates the old current UUID into
3827  * the bitmap slot. Causes an incremental resync upon next connect.
3828  */
3829 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3830 {
3831 	u64 val;
3832 	unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3833 
3834 	if (bm_uuid)
3835 		dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3836 
3837 	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3838 
3839 	get_random_bytes(&val, sizeof(u64));
3840 	_drbd_uuid_set(mdev, UI_CURRENT, val);
3841 	drbd_print_uuids(mdev, "new current UUID");
3842 	/* get it to stable storage _now_ */
3843 	drbd_md_sync(mdev);
3844 }
3845 
3846 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3847 {
3848 	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3849 		return;
3850 
3851 	if (val == 0) {
3852 		drbd_uuid_move_history(mdev);
3853 		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3854 		mdev->ldev->md.uuid[UI_BITMAP] = 0;
3855 	} else {
3856 		unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3857 		if (bm_uuid)
3858 			dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3859 
3860 		mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3861 	}
3862 	drbd_md_mark_dirty(mdev);
3863 }
3864 
3865 /**
3866  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3867  * @mdev:	DRBD device.
3868  *
3869  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3870  */
3871 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3872 {
3873 	int rv = -EIO;
3874 
3875 	if (get_ldev_if_state(mdev, D_ATTACHING)) {
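		/* MDF_FULL_SYNC is set and written to the meta data first, so
		 * that if we crash while writing the bitmap, the next attach
		 * still knows a full sync is needed; it is cleared again only
		 * after the bitmap itself reached stable storage. */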
3876 		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3877 		drbd_md_sync(mdev);
3878 		drbd_bm_set_all(mdev);
3879 
3880 		rv = drbd_bm_write(mdev);
3881 
3882 		if (!rv) {
3883 			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3884 			drbd_md_sync(mdev);
3885 		}
3886 
3887 		put_ldev(mdev);
3888 	}
3889 
3890 	return rv;
3891 }
3892 
3893 /**
3894  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3895  * @mdev:	DRBD device.
3896  *
3897  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3898  */
3899 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3900 {
3901 	int rv = -EIO;
3902 
3903 	drbd_resume_al(mdev);
3904 	if (get_ldev_if_state(mdev, D_ATTACHING)) {
3905 		drbd_bm_clear_all(mdev);
3906 		rv = drbd_bm_write(mdev);
3907 		put_ldev(mdev);
3908 	}
3909 
3910 	return rv;
3911 }
3912 
3913 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3914 {
3915 	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3916 	int rv = -EIO;
3917 
3918 	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3919 
3920 	if (get_ldev(mdev)) {
3921 		drbd_bm_lock(mdev, work->why, work->flags);
3922 		rv = work->io_fn(mdev);
3923 		drbd_bm_unlock(mdev);
3924 		put_ldev(mdev);
3925 	}
3926 
3927 	clear_bit(BITMAP_IO, &mdev->flags);
3928 	smp_mb__after_clear_bit();
3929 	wake_up(&mdev->misc_wait);
3930 
3931 	if (work->done)
3932 		work->done(mdev, rv);
3933 
3934 	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3935 	work->why = NULL;
3936 	work->flags = 0;
3937 
3938 	return 1;
3939 }
3940 
3941 void drbd_ldev_destroy(struct drbd_conf *mdev)
3942 {
3943 	lc_destroy(mdev->resync);
3944 	mdev->resync = NULL;
3945 	lc_destroy(mdev->act_log);
3946 	mdev->act_log = NULL;
3947 	__no_warn(local,
3948 		drbd_free_bc(mdev->ldev);
3949 		mdev->ldev = NULL;);
3950 
3951 	if (mdev->md_io_tmpp) {
3952 		__free_page(mdev->md_io_tmpp);
3953 		mdev->md_io_tmpp = NULL;
3954 	}
3955 	clear_bit(GO_DISKLESS, &mdev->flags);
3956 }
3957 
3958 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3959 {
3960 	D_ASSERT(mdev->state.disk == D_FAILED);
3961 	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3962 	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3963 	 * the protected members anymore, though, so once put_ldev reaches zero
3964 	 * again, it will be safe to free them. */
3965 	drbd_force_state(mdev, NS(disk, D_DISKLESS));
3966 	return 1;
3967 }
3968 
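/* GO_DISKLESS makes sure the D_FAILED -> D_DISKLESS transition is queued to
 * the worker only once even if drbd_go_diskless() is called repeatedly;
 * drbd_ldev_destroy() clears the flag again once the backing device state
 * has actually been torn down. */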
3969 void drbd_go_diskless(struct drbd_conf *mdev)
3970 {
3971 	D_ASSERT(mdev->state.disk == D_FAILED);
3972 	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3973 		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3974 }
3975 
3976 /**
3977  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3978  * @mdev:	DRBD device.
3979  * @io_fn:	IO callback to be called when bitmap IO is possible
3980  * @done:	callback to be called after the bitmap IO was performed
3981  * @why:	Descriptive text of the reason for doing the IO
3982  *
3983  * While IO on the bitmap happens we freeze application IO, thus ensuring
3984  * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3985  * called from worker context. It MUST NOT be used while a previous such
3986  * work is still pending!
3987  */
3988 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3989 			  int (*io_fn)(struct drbd_conf *),
3990 			  void (*done)(struct drbd_conf *, int),
3991 			  char *why, enum bm_flag flags)
3992 {
3993 	D_ASSERT(current == mdev->worker.task);
3994 
3995 	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3996 	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3997 	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3998 	if (mdev->bm_io_work.why)
3999 		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4000 			why, mdev->bm_io_work.why);
4001 
4002 	mdev->bm_io_work.io_fn = io_fn;
4003 	mdev->bm_io_work.done = done;
4004 	mdev->bm_io_work.why = why;
4005 	mdev->bm_io_work.flags = flags;
4006 
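	/* BITMAP_IO is set under req_lock.  If application IO is still in
	 * flight (ap_bio_cnt != 0) the work is not queued here; the path that
	 * drops ap_bio_cnt to zero is expected to queue bm_io_work.w instead
	 * (that completion path is outside this part of the code). */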
4007 	spin_lock_irq(&mdev->req_lock);
4008 	set_bit(BITMAP_IO, &mdev->flags);
4009 	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
4010 		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
4011 			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
4012 	}
4013 	spin_unlock_irq(&mdev->req_lock);
4014 }
4015 
4016 /**
4017  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
4018  * @mdev:	DRBD device.
4019  * @io_fn:	IO callback to be called when bitmap IO is possible
4020  * @why:	Descriptive text of the reason for doing the IO
4021  *
4022  * Freezes application IO while the actual IO operation runs. This
4023  * function MUST NOT be called from worker context.
4024  */
4025 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4026 		char *why, enum bm_flag flags)
4027 {
4028 	int rv;
4029 
4030 	D_ASSERT(current != mdev->worker.task);
4031 
4032 	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4033 		drbd_suspend_io(mdev);
4034 
4035 	drbd_bm_lock(mdev, why, flags);
4036 	rv = io_fn(mdev);
4037 	drbd_bm_unlock(mdev);
4038 
4039 	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4040 		drbd_resume_io(mdev);
4041 
4042 	return rv;
4043 }
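/*
 * Illustrative call (a sketch, not copied from an actual caller; the reason
 * string is a placeholder):
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			    "some reason", BM_LOCKED_SET_ALLOWED);
 */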
4044 
4045 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4046 {
4047 	if ((mdev->ldev->md.flags & flag) != flag) {
4048 		drbd_md_mark_dirty(mdev);
4049 		mdev->ldev->md.flags |= flag;
4050 	}
4051 }
4052 
4053 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4054 {
4055 	if ((mdev->ldev->md.flags & flag) != 0) {
4056 		drbd_md_mark_dirty(mdev);
4057 		mdev->ldev->md.flags &= ~flag;
4058 	}
4059 }
4060 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4061 {
4062 	return (bdev->md.flags & flag) != 0;
4063 }
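/* Illustrative use of the flag helpers (MDF_FULL_SYNC as used above):
 *
 *	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC))
 *		...;
 *
 * drbd_md_set_flag() and drbd_md_clear_flag() only mark the meta data dirty
 * when the flag value actually changes. */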
4064 
4065 static void md_sync_timer_fn(unsigned long data)
4066 {
4067 	struct drbd_conf *mdev = (struct drbd_conf *) data;
4068 
4069 	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4070 }
4071 
4072 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4073 {
4074 	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4075 #ifdef DEBUG
4076 	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4077 		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4078 #endif
4079 	drbd_md_sync(mdev);
4080 	return 1;
4081 }
4082 
4083 #ifdef CONFIG_DRBD_FAULT_INJECTION
4084 /* Fault insertion support including random number generator shamelessly
4085  * stolen from kernel/rcutorture.c */
4086 struct fault_random_state {
4087 	unsigned long state;
4088 	unsigned long count;
4089 };
4090 
4091 #define FAULT_RANDOM_MULT 39916801  /* prime */
4092 #define FAULT_RANDOM_ADD	479001701 /* prime */
4093 #define FAULT_RANDOM_REFRESH 10000
4094 
4095 /*
4096  * Crude but fast random-number generator.  Uses a linear congruential
4097  * generator, with occasional help from get_random_bytes().
4098  */
4099 static unsigned long
4100 _drbd_fault_random(struct fault_random_state *rsp)
4101 {
4102 	long refresh;
4103 
4104 	if (!rsp->count--) {
4105 		get_random_bytes(&refresh, sizeof(refresh));
4106 		rsp->state += refresh;
4107 		rsp->count = FAULT_RANDOM_REFRESH;
4108 	}
4109 	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4110 	return swahw32(rsp->state);
4111 }
4112 
4113 static char *
4114 _drbd_fault_str(unsigned int type) {
4115 	static char *_faults[] = {
4116 		[DRBD_FAULT_MD_WR] = "Meta-data write",
4117 		[DRBD_FAULT_MD_RD] = "Meta-data read",
4118 		[DRBD_FAULT_RS_WR] = "Resync write",
4119 		[DRBD_FAULT_RS_RD] = "Resync read",
4120 		[DRBD_FAULT_DT_WR] = "Data write",
4121 		[DRBD_FAULT_DT_RD] = "Data read",
4122 		[DRBD_FAULT_DT_RA] = "Data read ahead",
4123 		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
4124 		[DRBD_FAULT_AL_EE] = "EE allocation",
4125 		[DRBD_FAULT_RECEIVE] = "receive data corruption",
4126 	};
4127 
4128 	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4129 }
4130 
4131 unsigned int
4132 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4133 {
4134 	static struct fault_random_state rrs = {0, 0};
4135 
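	/* fault_devs is a bitmask of minor numbers (0 means "all devices") and
	 * fault_rate is a percentage: a fault is injected when this device is
	 * selected and a pseudo-random 1..100 roll is at or below fault_rate. */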
4136 	unsigned int ret = (
4137 		(fault_devs == 0 ||
4138 			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4139 		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4140 
4141 	if (ret) {
4142 		fault_count++;
4143 
4144 		if (__ratelimit(&drbd_ratelimit_state))
4145 			dev_warn(DEV, "***Simulating %s failure\n",
4146 				_drbd_fault_str(type));
4147 	}
4148 
4149 	return ret;
4150 }
4151 #endif
4152 
4153 const char *drbd_buildtag(void)
4154 {
4155 	/* When DRBD is built from external sources, this holds a reference
4156 	   to the git hash of that source code. */
4157 
4158 	static char buildtag[38] = "\0uilt-in";
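	/* "\0uilt-in": the leading NUL is overwritten with 'b' below when no
	 * module information is available, turning the string into "built-in". */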
4159 
4160 	if (buildtag[0] == 0) {
4161 #ifdef CONFIG_MODULES
4162 		if (THIS_MODULE != NULL)
4163 			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4164 		else
4165 #endif
4166 			buildtag[0] = 'b';
4167 	}
4168 
4169 	return buildtag;
4170 }
4171 
4172 module_init(drbd_init)
4173 module_exit(drbd_cleanup)
4174 
4175 EXPORT_SYMBOL(drbd_conn_str);
4176 EXPORT_SYMBOL(drbd_role_str);
4177 EXPORT_SYMBOL(drbd_disk_str);
4178 EXPORT_SYMBOL(drbd_set_st_err_str);
4179