Merge tag 'v3.15-rc1' into for-3.16/core

We don't like this, but things have diverged with the blk-mq fixes
in 3.15-rc1. So merge it in.
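
For reference, the kblockd interface carried by this branch drops the unused
request_queue argument and gains a CPU-pinned variant (see the
include/linux/blkdev.h hunk below). A minimal, illustrative sketch of how a
hardware context would now punt its delayed work to one of its mapped CPUs;
"example_kick_hw_queue" is a hypothetical caller, not part of this merge:

	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>
	#include <linux/cpumask.h>

	/* Assumes "hctx" is a valid, initialized hardware context. */
	static void example_kick_hw_queue(struct blk_mq_hw_ctx *hctx)
	{
		/* Pin the work to the first CPU mapped to this hw queue. */
		int cpu = cpumask_first(hctx->cpumask);

		/* Before this branch: kblockd_schedule_delayed_work(hctx->queue, ...) */
		kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
	}
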
diff --git a/block/blk-core.c b/block/blk-core.c
index a0e3096..1fe9ff6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1654,7 +1654,7 @@
 	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
 						NULL, &fail_make_request);
 
-	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+	return PTR_ERR_OR_ZERO(dir);
 }
 
 late_initcall(fail_make_request_debugfs);
@@ -2904,19 +2904,26 @@
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
+int kblockd_schedule_work(struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-int kblockd_schedule_delayed_work(struct request_queue *q,
-			struct delayed_work *dwork, unsigned long delay)
+int kblockd_schedule_delayed_work(struct delayed_work *dwork,
+				  unsigned long delay)
 {
 	return queue_delayed_work(kblockd_workqueue, dwork, delay);
 }
 EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+				     unsigned long delay)
+{
+	return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
+
 #define PLUG_MAGIC	0x91827364
 
 /**
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b47..77f2045 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -144,7 +144,7 @@
 {
 	if (rq->q->mq_ops) {
 		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
-		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
+		kblockd_schedule_work(&rq->mq_flush_work);
 		return false;
 	} else {
 		if (add_front)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index b0ba264..9176a69 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -246,16 +246,12 @@
 
 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-	unsigned int i, queue_num, first = 1;
+	unsigned int i, first = 1;
 	ssize_t ret = 0;
 
 	blk_mq_disable_hotplug();
 
-	for_each_online_cpu(i) {
-		queue_num = hctx->queue->mq_map[i];
-		if (queue_num != hctx->queue_num)
-			continue;
-
+	for_each_cpu(i, hctx->cpumask) {
 		if (first)
 			ret += sprintf(ret + page, "%u", i);
 		else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1d2a9bd..5455ed1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -209,11 +209,14 @@
 			break;
 		}
 
-		blk_mq_put_ctx(ctx);
-		if (!(gfp & __GFP_WAIT))
+		if (gfp & __GFP_WAIT) {
+			__blk_mq_run_hw_queue(hctx);
+			blk_mq_put_ctx(ctx);
+		} else {
+			blk_mq_put_ctx(ctx);
 			break;
+		}
 
-		__blk_mq_run_hw_queue(hctx);
 		blk_mq_wait_for_tags(hctx->tags);
 	} while (1);
 
@@ -514,6 +517,8 @@
 	LIST_HEAD(rq_list);
 	int bit, queued;
 
+	WARN_ON(!preempt_count());
+
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 		return;
 
@@ -606,12 +611,21 @@
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 		return;
 
-	if (!async)
+	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
 		__blk_mq_run_hw_queue(hctx);
+	else if (hctx->queue->nr_hw_queues == 1)
+		kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
 	else {
-		struct request_queue *q = hctx->queue;
+		unsigned int cpu;
 
-		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
+		/*
+		 * It'd be great if the workqueue API had a way to pass
+		 * in a mask and had some smarts for more clever placement
+		 * than the first CPU. Or we could round-robin here. For now,
+		 * just queue on the first CPU.
+		 */
+		cpu = cpumask_first(hctx->cpumask);
+		kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
 	}
 }
 
@@ -626,7 +640,9 @@
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
+		preempt_disable();
 		blk_mq_run_hw_queue(hctx, async);
+		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -651,7 +667,10 @@
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+
+	preempt_disable();
 	__blk_mq_run_hw_queue(hctx);
+	preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -665,7 +684,9 @@
 			continue;
 
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+		preempt_disable();
 		blk_mq_run_hw_queue(hctx, true);
+		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -675,7 +696,10 @@
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
+
+	preempt_disable();
 	__blk_mq_run_hw_queue(hctx);
+	preempt_enable();
 }
 
 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
@@ -719,10 +743,10 @@
 		spin_unlock(&ctx->lock);
 	}
 
-	blk_mq_put_ctx(current_ctx);
-
 	if (run_queue)
 		blk_mq_run_hw_queue(hctx, async);
+
+	blk_mq_put_ctx(current_ctx);
 }
 
 static void blk_mq_insert_requests(struct request_queue *q,
@@ -758,9 +782,8 @@
 	}
 	spin_unlock(&ctx->lock);
 
-	blk_mq_put_ctx(current_ctx);
-
 	blk_mq_run_hw_queue(hctx, from_schedule);
+	blk_mq_put_ctx(current_ctx);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -879,7 +902,6 @@
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_put_ctx(ctx);
 		blk_insert_flush(rq);
 		goto run_queue;
 	}
@@ -917,7 +939,6 @@
 	}
 
 	spin_unlock(&ctx->lock);
-	blk_mq_put_ctx(ctx);
 
 	/*
 	 * For a SYNC request, send it to the hardware immediately. For an
@@ -926,6 +947,7 @@
 	 */
 run_queue:
 	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
+	blk_mq_put_ctx(ctx);
 }
 
 /*
@@ -993,9 +1015,9 @@
 	blk_mq_hctx_mark_pending(hctx, ctx);
 
 	spin_unlock(&ctx->lock);
-	blk_mq_put_ctx(ctx);
 
 	blk_mq_run_hw_queue(hctx, true);
+	blk_mq_put_ctx(ctx);
 }
 
 static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
@@ -1258,12 +1280,13 @@
 		__ctx->queue = q;
 
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
-		hctx = q->mq_ops->map_queue(q, i);
-		hctx->nr_ctx++;
-
 		if (!cpu_online(i))
 			continue;
 
+		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
+		hctx->nr_ctx++;
+
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
@@ -1280,6 +1303,7 @@
 	struct blk_mq_ctx *ctx;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
+		cpumask_clear(hctx->cpumask);
 		hctx->nr_ctx = 0;
 	}
 
@@ -1288,7 +1312,11 @@
 	 */
 	queue_for_each_ctx(q, ctx, i) {
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
+		if (!cpu_online(i))
+			continue;
+
 		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -1332,6 +1360,9 @@
 		if (!hctxs[i])
 			goto err_hctxs;
 
+		if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+			goto err_hctxs;
+
 		hctxs[i]->numa_node = NUMA_NO_NODE;
 		hctxs[i]->queue_num = i;
 	}
@@ -1395,6 +1426,7 @@
 	for (i = 0; i < reg->nr_hw_queues; i++) {
 		if (!hctxs[i])
 			break;
+		free_cpumask_var(hctxs[i]->cpumask);
 		reg->ops->free_hctx(hctxs[i], i);
 	}
 	kfree(hctxs);
@@ -1416,6 +1448,7 @@
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
+		free_cpumask_var(hctx->cpumask);
 		q->mq_ops->free_hctx(hctx, i);
 	}
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e0985f1..5063a0b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -908,7 +908,7 @@
 {
 	if (cfqd->busy_queues) {
 		cfq_log(cfqd, "schedule dispatch");
-		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+		kblockd_schedule_work(&cfqd->unplug_work);
 	}
 }
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 65a123d..0f3bddc 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -139,7 +139,7 @@
 	 */
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, cmd->request);
-	kblockd_schedule_work(q, &device->requeue_work);
+	kblockd_schedule_work(&device->requeue_work);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451..b6ee487 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -19,6 +19,7 @@
 
 	unsigned long		state;		/* BLK_MQ_S_* flags */
 	struct delayed_work	delayed_work;
+	cpumask_var_t		cpumask;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0d84981..86a8df1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1370,8 +1370,9 @@
 }
 
 struct work_struct;
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
-int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
+int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
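
A note on the preempt_disable()/preempt_enable() pairs added around the
blk_mq_run_hw_queue() and __blk_mq_run_hw_queue() call sites above: with this
merge the hardware queue is only run inline when the current CPU is in
hctx->cpumask, so callers pin themselves for the duration of the check and the
run (hence the WARN_ON(!preempt_count()) in __blk_mq_run_hw_queue()). An
illustrative sketch of the calling convention, assuming "hctx" is a valid
hardware context:

	preempt_disable();
	/* Runs inline if this CPU is in hctx->cpumask, else punts to kblockd. */
	blk_mq_run_hw_queue(hctx, false);
	preempt_enable();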