drbd: do not reset rs_pending_cnt too early Fix asserts like block drbd0: in got_BlockAck:4634: rs_pending_cnt = -35 < 0 ! We reset the resync lru cache and related information (rs_pending_cnt), once we successfully finished a resync or online verify, or if the replication connection is lost. We also need to reset it if a resync or online verify is aborted because a lower level disk failed. In that case the replication link is still established, and we may still have packets queued in the network buffers which want to touch rs_pending_cnt. We do not have any synchronization mechanism to know for sure when all such pending resync related packets have been drained. To avoid this counter to go negative (and violate the ASSERT that it will always be >= 0), just do not reset it when we lose a disk. It is good enough to make sure it is re-initialized before the next resync can start: reset it when we re-attach a disk. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>

commit: a324896b173e569fb831c5caa04ccd02ec0bc9ca [log] [tgz]
author: Lars Ellenberg <lars.ellenberg@linbit.com> Mon Jul 30 09:10:41 2012 +0200
committer: Philipp Reisner <philipp.reisner@linbit.com> Thu Nov 08 16:58:40 2012 +0100
tree: fedb4c82e66c304c6ced91a9e83538af735ddb45
parent: 8a943170711b7a4d63528ea8eb6a41cc91e79309 [diff]
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 05ed480..a2925de 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c

@@ -1309,6 +1309,11 @@
 	/* make sure there is no leftover from previous force-detach attempts */
 	clear_bit(FORCE_DETACH, &mdev->flags);
 
+	/* and no leftover from previously aborted resync or verify, either */
+	mdev->rs_total = 0;
+	mdev->rs_failed = 0;
+	atomic_set(&mdev->rs_pending_cnt, 0);
+
 	/* allocation not in the IO path, drbdsetup context */
 	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
 	if (!nbc) {

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index c9ec7d3..ad307fb 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c

@@ -1216,6 +1216,13 @@
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS &&
 	    ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) {      /* attach on the peer */
+		/* we probably will start a resync soon.
+		 * make sure those things are properly reset. */
+		mdev->rs_total = 0;
+		mdev->rs_failed = 0;
+		atomic_set(&mdev->rs_pending_cnt, 0);
+		drbd_rs_cancel_all(mdev);
+
 		drbd_send_uuids(mdev);
 		drbd_send_state(mdev, ns);
 	}
@@ -1386,10 +1393,6 @@
                                 "ASSERT FAILED: disk is %s while going diskless\n",
                                 drbd_disk_str(mdev->state.disk));
 
-                mdev->rs_total = 0;
-                mdev->rs_failed = 0;
-                atomic_set(&mdev->rs_pending_cnt, 0);
-
 		if (ns.conn >= C_CONNECTED)
 			drbd_send_state(mdev, ns);
 		/* corresponding get_ldev in __drbd_set_state

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 07a4046..9d7e1fb 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c

@@ -1549,14 +1549,6 @@
 		return;
 	}
 
-	if (mdev->state.conn < C_AHEAD) {
-		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
-		drbd_rs_cancel_all(mdev);
-		/* This should be done when we abort the resync. We definitely do not
-		   want to have this for connections going back and forth between
-		   Ahead/Behind and SyncSource/SyncTarget */
-	}
-
 	if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
 		if (side == C_SYNC_TARGET) {
 			/* Since application IO was locked out during C_WF_BITMAP_T and
commit	a324896b173e569fb831c5caa04ccd02ec0bc9ca	[log] [tgz]
author	Lars Ellenberg <lars.ellenberg@linbit.com>	Mon Jul 30 09:10:41 2012 +0200
committer	Philipp Reisner <philipp.reisner@linbit.com>	Thu Nov 08 16:58:40 2012 +0100
tree	fedb4c82e66c304c6ced91a9e83538af735ddb45
parent	8a943170711b7a4d63528ea8eb6a41cc91e79309 [diff]