Skip to content
Snippets Groups Projects
0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.4.0.patch 223 KiB
Newer Older
From 03d30cc06a5436c05ee338bd21903802181bafe9 Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@unimore.it>
Date: Thu, 9 May 2013 19:10:02 +0200
Subject: [PATCH 2/3] block: introduce the BFQ-v7r11 I/O sched for 4.4.0

The general structure is borrowed from CFQ, as much of the code for
handling I/O contexts. Over time, several useful features have been
ported from CFQ as well (details in the changelog in README.BFQ). A
(bfq_)queue is associated to each task doing I/O on a device, and each
time a scheduling decision has to be made a queue is selected and served
until it expires.

    - Slices are given in the service domain: tasks are assigned
      budgets, measured in number of sectors. Once got the disk, a task
      must however consume its assigned budget within a configurable
      maximum time (by default, the maximum possible value of the
      budgets is automatically computed to comply with this timeout).
      This allows the desired latency vs "throughput boosting" tradeoff
      to be set.

    - Budgets are scheduled according to a variant of WF2Q+, implemented
      using an augmented rb-tree to take eligibility into account while
      preserving an O(log N) overall complexity.

    - A low-latency tunable is provided; if enabled, both interactive
      and soft real-time applications are guaranteed a very low latency.

    - Latency guarantees are preserved also in the presence of NCQ.

    - Also with flash-based devices, a high throughput is achieved
      while still preserving latency guarantees.

    - BFQ features Early Queue Merge (EQM), a sort of fusion of the
      cooperating-queue-merging and the preemption mechanisms present
      in CFQ. EQM is in fact a unified mechanism that tries to get a
      sequential read pattern, and hence a high throughput, with any
      set of processes performing interleaved I/O over a contiguous
      sequence of sectors.

    - BFQ supports full hierarchical scheduling, exporting a cgroups
      interface.  Since each node has a full scheduler, each group can
      be assigned its own weight.

    - If the cgroups interface is not used, only I/O priorities can be
      assigned to processes, with ioprio values mapped to weights
      with the relation weight = IOPRIO_BE_NR - ioprio.

    - ioprio classes are served in strict priority order, i.e., lower
      priority queues are not served as long as there are higher
      priority queues.  Among queues in the same class the bandwidth is
      distributed in proportion to the weight of each queue. A very
      thin extra bandwidth is however guaranteed to the Idle class, to
      prevent it from starving.

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini@google.com>
 block/Kconfig.iosched |    6 +-
 block/bfq-cgroup.c    | 1182 ++++++++++++++++
 block/bfq-ioc.c       |   36 +
 block/bfq-iosched.c   | 3754 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/bfq-sched.c     | 1200 ++++++++++++++++
 block/bfq.h           |  801 +++++++++++
 6 files changed, 6975 insertions(+), 4 deletions(-)
 create mode 100644 block/bfq-cgroup.c
 create mode 100644 block/bfq-ioc.c
 create mode 100644 block/bfq-iosched.c
 create mode 100644 block/bfq-sched.c
 create mode 100644 block/bfq.h

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 0ee5f0f..f78cd1a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -51,14 +51,12 @@ config IOSCHED_BFQ
 	  applications. If compiled built-in (saying Y here), BFQ can
 	  be configured to support hierarchical scheduling.
 
-config CGROUP_BFQIO
+config BFQ_GROUP_IOSCHED
 	bool "BFQ hierarchical scheduling support"
 	depends on CGROUPS && IOSCHED_BFQ=y
 	default n
 	---help---
-	  Enable hierarchical scheduling in BFQ, using the cgroups
-	  filesystem interface.  The name of the subsystem will be
-	  bfqio.
+	  Enable hierarchical scheduling in BFQ, using the blkio controller.
 
 choice
 	prompt "Default I/O scheduler"
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 0000000..8610cd6
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1182 @@
+/*
+ * BFQ: CGROUPS support.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+	BFQG_stats_waiting = 0,
+	BFQG_stats_idling,
+	BFQG_stats_empty,
+};
+#define BFQG_FLAG_FNS(name)						\
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << BFQG_stats_##name);			\
+}									\
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << BFQG_stats_##name);			\
+}									\
+static int bfqg_stats_##name(struct bfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
+}									\
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+	unsigned long long now;
+
+	if (!bfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	bfqg_stats_clear_waiting(stats);
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_waiting(stats))
+		return;
+	if (bfqg == curr_bfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(stats);
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+	unsigned long long now;
+	if (!bfqg_stats_empty(stats))
+		return;
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	bfqg_stats_clear_empty(stats);
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+	blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (blkg_rwstat_total(&stats->queued))
+		return;
+	 * group is already marked empty. This can happen if bfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	if (bfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		bfqg_stats_clear_idling(stats);
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+	struct bfqg_stats *stats = &bfqg->stats;
+	stats->start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(stats);
+}
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_total(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(stats);
+static struct blkcg_policy blkcg_policy_bfq;
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	return pd_to_blkg(&bfqg->pd);
+}
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
+	BUG_ON(!pd);
+	return pd_to_bfqg(pd);
+}
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+	return group_entity ? container_of(group_entity, struct bfq_group,
+					   entity) :
+			      bfqq->bfqd->root_group;
+}
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	return blkg_get(bfqg_to_blkg(bfqg));
+static void bfqg_put(struct bfq_group *bfqg)
+{
+	return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+				     struct bfq_queue *bfqq,
+				     int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, rw, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, rw, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);
+}
+
+static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,
+					      uint64_t bytes, int rw)
+{
+	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);
+	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);
+	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, rw,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	if (!stats)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->service_bytes);
+	blkg_rwstat_reset(&stats->serviced);
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->unaccounted_time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
+	blkg_rwstat_add_aux(&to->serviced, &from->serviced);
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&from->time, &from->time);
+	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+	struct bfq_group *parent;
+	if (!bfqg) /* root_group */
+		return;
+	parent = bfqg_parent(bfqg);
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+	if (unlikely(!parent))
+		return;
+	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);
+	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);
+	bfqg_stats_reset(&bfqg->stats);
+	bfqg_stats_reset(&bfqg->dead_stats);
+}
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+		bfqg_get(bfqg);
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+}
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->service_bytes);
+	blkg_rwstat_exit(&stats->serviced);
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	blkg_stat_exit(&stats->sectors);
+	blkg_stat_exit(&stats->time);
+	blkg_stat_exit(&stats->unaccounted_time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
+	    blkg_rwstat_init(&stats->serviced, gfp) ||
+	    blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+	    blkg_stat_init(&stats->sectors, gfp) ||
+	    blkg_stat_init(&stats->time, gfp) ||
+	    blkg_stat_init(&stats->unaccounted_time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp)) {
+		bfqg_stats_exit(stats);
+		return -ENOMEM;
+
+	return 0;
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+ {
+	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+ }
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+	d->weight = BFQ_DEFAULT_GRP_WEIGHT;
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+	struct bfq_group *bfqg;
+
+	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+	if (!bfqg)
+		return NULL;
+	if (bfqg_stats_init(&bfqg->stats, gfp) ||
+	    bfqg_stats_init(&bfqg->dead_stats, gfp)) {
+		kfree(bfqg);
+		return NULL;
+	}
+
+	return &bfqg->pd;
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
+{
+	struct bfq_entity *entity;
+
+	BUG_ON(!parent);
+	BUG_ON(!bfqg);
+	BUG_ON(bfqg == parent);
+
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+	struct bfq_entity *entity = &bfqg->entity;
+	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+
+	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+	entity->my_sched_data = &bfqg->sched_data;
+	bfqg->my_entity = entity; /*
+				   * the root_group's will be set to NULL
+				   * in bfq_init_queue()
+				   */
+	bfqg->bfqd = bfqd;
+	bfqg->active_entities = 0;
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_exit(&bfqg->stats);
+	bfqg_stats_exit(&bfqg->dead_stats);
+
+	return kfree(bfqg);
+}
+
+/* offset delta from bfqg->stats to bfqg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -
+					offsetof(struct bfq_group, stats);
+
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	u64 sum = 0;
+
+	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
+	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
+				       off + dead_stats_off_delta);
+	return sum;
+}
+
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+						       int off)
+{
+	struct blkg_rwstat a, b;
+
+	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
+	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
+				      off + dead_stats_off_delta);
+	blkg_rwstat_add_aux(&a, &b);
+	return a;
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_reset(&bfqg->stats);
+	bfqg_stats_reset(&bfqg->dead_stats);
+}
+
+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
+					      struct blkcg *blkcg)
+{
+	struct request_queue *q = bfqd->queue;
+	struct bfq_group *bfqg = NULL, *parent;
+	struct bfq_entity *entity = NULL;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	/* avoid lookup for the common case where there's no blkcg */
+	if (blkcg == &blkcg_root) {
+		bfqg = bfqd->root_group;
+	} else {
+		struct blkcg_gq *blkg;
+
+		blkg = blkg_lookup_create(blkcg, q);
+		if (!IS_ERR(blkg))
+			bfqg = blkg_to_bfqg(blkg);
+		else /* fallback to root_group */
+			bfqg = bfqd->root_group;
+	}
+
+	BUG_ON(!bfqg);
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		bfqg = container_of(entity, struct bfq_group, entity);
+		BUG_ON(!bfqg);
+		if (bfqg != bfqd->root_group) {
+			parent = bfqg_parent(bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			BUG_ON(!parent);
+			bfq_group_set_parent(bfqg, parent);
+		}
+	}
+
+	return bfqg;
+}
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @entity: @bfqq's entity.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one.  Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+	int busy, resume;
+
+	busy = bfq_bfqq_busy(bfqq);
+	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
+
+	BUG_ON(resume && !entity->on_st);
+	BUG_ON(busy && !resume && entity->on_st &&
+	       bfqq != bfqd->in_service_queue);
+
+	if (busy) {
+		BUG_ON(atomic_read(&bfqq->ref) < 2);
+
+		if (!resume)
+			bfq_del_bfqq_busy(bfqd, bfqq, 0);
+		else
+			bfq_deactivate_bfqq(bfqd, bfqq, 0);
+	} else if (entity->on_st)
+		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	bfqg_put(bfqq_group(bfqq));
+
+	/*
+	 * Here we use a reference to bfqg.  We don't need a refcounter
+	 * as the cgroup reference will not be dropped, so that its
+	 * destroy() callback will not be invoked.
+	 */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+	bfqg_get(bfqg);
+	if (busy) {
+		if (resume)
+			bfq_activate_bfqq(bfqd, bfqq);
+	}
+	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+						struct bfq_io_cq *bic,
+						struct blkcg *blkcg)
+{
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_group *bfqg;
+	struct bfq_entity *entity;
+	lockdep_assert_held(bfqd->queue->queue_lock);
+	bfqg = bfq_find_alloc_group(bfqd, blkcg);
+	if (async_bfqq) {
+		entity = &async_bfqq->entity;
+
+		if (entity->sched_data != &bfqg->sched_data) {
+			bic_set_bfqq(bic, NULL, 0);
+			bfq_log_bfqq(bfqd, async_bfqq,
+				     "bic_change_group: %p %d",
+				     async_bfqq, atomic_read(&async_bfqq->ref));
+			bfq_put_queue(async_bfqq);
+		}
+	}
+
+	if (sync_bfqq) {
+		entity = &sync_bfqq->entity;
+		if (entity->sched_data != &bfqg->sched_data)
+			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
+	}
+
+	return bfqg;
+}
+
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct blkcg *blkcg;
+	struct bfq_group *bfqg = NULL;
+	uint64_t id;
+	blkcg = bio_blkcg(bio);
+	id = blkcg->css.serial_nr;
+	rcu_read_unlock();
+
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))
+		return;
+
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);
+	BUG_ON(!bfqg);
+	bic->blkcg_id = id;
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entity = st->first_idle;
+
+	for (; entity ; entity = st->first_idle)
+		__bfq_deactivate_entity(entity, 0);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	BUG_ON(!bfqq);
+	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
+	return;
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
+{
+	struct rb_root *active = &st->active;
+	struct bfq_entity *entity = NULL;
+
+	if (!RB_EMPTY_ROOT(&st->active))
+		entity = bfq_entity_of(rb_first(active));
+
+	for (; entity ; entity = bfq_entity_of(rb_first(active)))
+		bfq_reparent_leaf_entity(bfqd, entity);
+
+	if (bfqg->sched_data.in_service_entity)
+		bfq_reparent_leaf_entity(bfqd,
+			bfqg->sched_data.in_service_entity);
+
+	return;
+}
+
+/**
+ * bfq_destroy_group - destroy @bfqg.
+ * @bfqg: the group being destroyed.
+ *
+ * Destroy @bfqg, making sure that it is not referenced from its parent.
+ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic
+static void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+	struct bfq_service_tree *st;
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+	struct bfq_entity *entity;
+	BUG_ON(!pd);
+	bfqg = pd_to_bfqg(pd);
+	BUG_ON(!bfqg);
+	bfqd = bfqg->bfqd;
+	BUG_ON(bfqd && !bfqd->root_group);
+
+	entity = bfqg->my_entity;
+
+	if (!entity) /* root group */
+		return;
+
+	/*
+	 * Empty all service_trees belonging to this group before
+	 * deactivating the group itself.
+	 */
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+		BUG_ON(!bfqg->sched_data.service_tree);
+		st = bfqg->sched_data.service_tree + i;
+		/*
+		 * The idle tree may still contain bfq_queues belonging
+		 * to exited task because they never migrated to a different
+		 * cgroup from the one being destroyed now.  No one else
+		 * can access them so it's safe to act without any lock.
+		 */
+		bfq_flush_idle_tree(st);
+
+		/*
+		 * It may happen that some queues are still active
+		 * (busy) upon group destruction (if the corresponding
+		 * processes have been forced to terminate). We move
+		 * all the leaf entities corresponding to these queues
+		 * to the root_group.
+		 * Also, it may happen that the group has an entity
+		 * in service, which is disconnected from the active
+		 * tree: it must be moved, too.
+		 * There is no need to put the sync queues, as the
+		 * scheduler has taken no reference.
+		 */
+		bfq_reparent_active_entities(bfqd, bfqg, st);
+		BUG_ON(!RB_EMPTY_ROOT(&st->active));
+		BUG_ON(!RB_EMPTY_ROOT(&st->idle));
+	}
+	BUG_ON(bfqg->sched_data.next_in_service);
+	BUG_ON(bfqg->sched_data.in_service_entity);
+	__bfq_deactivate_entity(entity, 0);
+	bfq_put_async_queues(bfqd, bfqg);
+	BUG_ON(entity->tree);
+	bfqg_stats_xfer_dead(bfqg);
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	struct blkcg_gq *blkg;
+	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+		bfq_end_wr_async_queues(bfqd, bfqg);
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
+				       struct cftype *cftype)
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	int ret = -EINVAL;
+	spin_lock_irq(&blkcg->lock);
+	ret = bfqgd->weight;
+	spin_unlock_irq(&blkcg->lock);
+	return ret;
+static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	spin_lock_irq(&blkcg->lock);
+	seq_printf(sf, "%u\n", bfqgd->weight);
+	spin_unlock_irq(&blkcg->lock);
+	return 0;
+static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
+					struct cftype *cftype,
+					u64 val)
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	struct blkcg_gq *blkg;
+	int ret = -EINVAL;
+	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+		return ret;
+
+	ret = 0;
+	spin_lock_irq(&blkcg->lock);
+	bfqgd->weight = (unsigned short)val;
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+		if (!bfqg)
+			continue;
+		 * Setting the prio_changed flag of the entity
+		 * to 1 with new_weight == weight would re-set
+		 * the value of the weight to its ioprio mapping.
+		 * Set the flag only if necessary.
+		if ((unsigned short)val != bfqg->entity.new_weight) {
+			bfqg->entity.new_weight = (unsigned short)val;
+			 * Make sure that the above new value has been
+			 * stored in bfqg->entity.new_weight before
+			 * setting the prio_changed flag. In fact,
+			 * this flag may be read asynchronously (in
+			 * critical sections protected by a different
+			 * lock than that held here), and finding this
+			 * flag set may cause the execution of the code
+			 * for updating parameters whose value may
+			 * depend also on bfqg->entity.new_weight (in
+			 * __bfq_entity_update_weight_prio).
+			 * This barrier makes sure that the new value
+			 * of bfqg->entity.new_weight is correctly
+			 * seen in that code.
+			smp_wmb();
+			bfqg->entity.prio_changed = 1;
+		}
+	spin_unlock_irq(&blkcg->lock);
+static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,
+					     char *buf, size_t nbytes,
+					     loff_t off)
+	/* First unsigned long found in the file is used */
+	return bfqio_cgroup_weight_write(of_css(of), NULL,
+					 simple_strtoull(strim(buf), NULL, 0));
+}
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
+	return 0;
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
+	return 0;
+}
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);
+
+	return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);
+
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, false);
+	return 0;
+}
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, true);
+	return 0;
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
+	if (samples) {
+		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = div64_u64(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+			  0, false);
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+	int ret;
+
+	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+	if (ret)
+		return NULL;
+
+        return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+        struct bfq_group_data *bgd;
+
+        bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
+        if (!bgd)
+                return NULL;
+        return &bgd->pd;
+}
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+        kfree(cpd_to_bfqgd(cpd));
+static struct cftype bfqio_files_dfl[] = {
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfqio_cgroup_weight_read_dfl,
+		.write = bfqio_cgroup_weight_write_dfl,
+	},
+	{} /* terminate */
+};
+
+static struct cftype bfqio_files[] = {
+	{
+		.name = "bfq.weight",
+		.read_u64 = bfqio_cgroup_weight_read,
+		.write_u64 = bfqio_cgroup_weight_write,
+	},
+	/* statistics, cover only the tasks in the bfqg */
+	{
+		.name = "bfq.time",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.sectors",
+		.private = offsetof(struct bfq_group, stats.sectors),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.io_service_bytes",
+		.private = offsetof(struct bfq_group, stats.service_bytes),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_serviced",
+		.private = offsetof(struct bfq_group, stats.serviced),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_service_time",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_wait_time",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_merged",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_queued",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat,
+	},
+
+	/* the same statictics which cover the bfqg and its descendants */
+	{
+		.name = "bfq.time_recursive",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.sectors_recursive",
+		.private = offsetof(struct bfq_group, stats.sectors),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.io_service_bytes_recursive",
+		.private = offsetof(struct bfq_group, stats.service_bytes),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_serviced_recursive",
+		.private = offsetof(struct bfq_group, stats.serviced),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_service_time_recursive",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_wait_time_recursive",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_merged_recursive",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_queued_recursive",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.avg_queue_size",
+		.seq_show = bfqg_print_avg_queue_size,
+	},
+	{
+		.name = "bfq.group_wait_time",
+		.private = offsetof(struct bfq_group, stats.group_wait_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.idle_time",
+		.private = offsetof(struct bfq_group, stats.idle_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.empty_time",
+		.private = offsetof(struct bfq_group, stats.empty_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.dequeue",
+		.private = offsetof(struct bfq_group, stats.dequeue),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.unaccounted_time",
+		.private = offsetof(struct bfq_group, stats.unaccounted_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
+static struct blkcg_policy blkcg_policy_bfq = {
+       .dfl_cftypes            = bfqio_files_dfl,
+       .legacy_cftypes         = bfqio_files,
+
+       .pd_alloc_fn            = bfq_pd_alloc,
+       .pd_init_fn             = bfq_pd_init,
+       .pd_offline_fn          = bfq_pd_offline,
+       .pd_free_fn             = bfq_pd_free,
+       .pd_reset_stats_fn      = bfq_pd_reset_stats,
+
+       .cpd_alloc_fn           = bfq_cpd_alloc,
+       .cpd_init_fn            = bfq_cpd_init,
+       .cpd_bind_fn	       = bfq_cpd_init,
+       .cpd_free_fn            = bfq_cpd_free,
+
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	}
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static struct bfq_group *
+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	return bfqd->root_group;
+}
+
+static void bfq_bfqq_move(struct bfq_data *bfqd,
+			  struct bfq_queue *bfqq,
+			  struct bfq_entity *entity,
+			  struct bfq_group *bfqg)
+{
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static void bfq_disconnect_groups(struct bfq_data *bfqd)
+{
+	bfq_put_async_queues(bfqd, bfqd->root_group);
+}
+
+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
+                                              struct blkcg *blkcg)
+	return bfqd->root_group;
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	struct bfq_group *bfqg;
+	int i;
+
+	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+	if (!bfqg)
+		return NULL;
+
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+	return bfqg;
+}
+#endif
diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
new file mode 100644
index 0000000..fb7bb8f
--- /dev/null
+++ b/block/bfq-ioc.c
@@ -0,0 +1,36 @@
+/*
+ * BFQ: I/O context handling.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+	/* bic->icq is the first member, %NULL will convert to %NULL */
+	return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ *
+ * Queue lock must be held.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc)
+{
+	if (ioc)
+		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
+	return NULL;
+}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 0000000..f9787a6
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,3754 @@
+/*
+ * Budget Fair Queueing (BFQ) disk scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based on
+ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
+ * measured in number of sectors, to processes instead of time slices. The
+ * device is not granted to the in-service process for a given time slice,
+ * but until it has exhausted its assigned budget. This change from the time
+ * to the service domain allows BFQ to distribute the device throughput
+ * among processes as desired, without any distortion due to ZBR, workload
+ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
+ * called B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated to processes. Thanks to the
+ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
+ * I/O-bound processes issuing sequential requests (to boost the
+ * throughput), and yet guarantee a low latency to interactive and soft
+ * real-time applications.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
+ *     with the BFQ Disk I/O Scheduler'',
+ *     Proceedings of the 5th Annual International Systems and Storage
+ *     Conference (SYSTOR '12), June 2012.
+ *
+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ *     Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ *     First: A Flexible and Accurate Mechanism for Proportional Share
+ *     Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+#include "blk.h"
+
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in jiffies. */
+static int bfq_slice_idle = HZ / 125;
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+static const int bfq_max_budget_async_rq = 4;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout_sync = HZ / 8;
+static int bfq_timeout_async = HZ / 25;
+
+struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ms), we consider thinktime immediate. */
+#define BFQ_MIN_TT		2
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
+
+#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+
+/* Min samples used for peak rate estimation (for autotuning). */
+#define BFQ_PEAK_RATE_SAMPLES	32
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
+ * are the reference values for a slow/fast rotational device, whereas
+ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
+ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes.
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1536, 10752};
+static int R_fast[2] = {17415, 34791};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
+
+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
+
+#include "bfq-ioc.c"
+#include "bfq-sched.c"
+#include "bfq-cgroup.c"
+
+#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples)	((samples) > 80)
+
+/*
+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
+ * set (in which case it could also be a direct WRITE).
+ */
+static int bfq_bio_sync(struct bio *bio)
+{
+	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		kblockd_schedule_work(&bfqd->unplug_work);
+	}
+}
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
+ * We choose the request that is closesr to the head right now.  Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+				      struct request *rq1,
+				      struct request *rq2,
+				      sector_t last)
+{
+	sector_t s1, s2, d1 = 0, d2 = 0;
+	unsigned long back_max;
+#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
+
+	if (!rq1 || rq1 == rq2)
+	if (!rq2)
+		return rq1;
+
+	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+		return rq1;
+	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+		return rq2;
+	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+		return rq1;
+	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+		return rq2;
+
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
+
+	/*
+	 * By definition, 1KiB is 2 sectors.
+	 */
+	back_max = bfqd->bfq_back_max * 2;
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks which are biased as twice the cost of a
+	 * similar forward seek.
+	 */
+	if (s1 >= last)
+		d1 = s1 - last;
+	else if (s1 + back_max >= last)
+		d1 = (last - s1) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ1_WRAP;
+
+	if (s2 >= last)
+		d2 = s2 - last;
+	else if (s2 + back_max >= last)
+		d2 = (last - s2) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ2_WRAP;
+
+	/* Found required data */
+
+	/*
+	 * By doing switch() on the bit mask "wrap" we avoid having to
+	 * check two variables for all permutations: --> faster!
+	 */
+	switch (wrap) {
+	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+		if (d1 < d2)
+			return rq1;
+		else if (d2 < d1)
+			return rq2;
+		else {
+			if (s1 >= s2)
+				return rq1;
+			else
+				return rq2;
+		}
+
+	case BFQ_RQ2_WRAP:
+		return rq1;
+	case BFQ_RQ1_WRAP:
+		return rq2;
+	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
+	default:
+		/*
+		 * Since both rqs are wrapped,
+		 * start with the one that's further behind head
+		 * (--> only *one* back seek required),
+		 * since back seek takes more time than forward.
+		 */
+		if (s1 <= s2)
+			return rq1;
+		else
+			return rq2;
+	}
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+	/*
+	 * For weights to differ, at least one of the trees must contain
+	 * at least two nodes.
+	 */
+	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	       ) ||
+	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+		(bfqd->group_weights_tree.rb_node->rb_left ||
+		 bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+	       );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+	return
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		!bfqd->active_numerous_groups &&
+#endif
+		!bfq_differentiated_weights(bfqd);
+}
+
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 *   1) the entity is associated with a queue,
+	 *   2) a request arrival has caused the queue to become both
+	 *      non-weight-raised, and hence change its weight, and
+	 *      backlogged; in this respect, each of the two events
+	 *      causes an invocation of this function,
+	 *   3) this is the invocation of this function caused by the
+	 *      second event. This second invocation is actually useless,
+	 *      and we handle this fact by exiting immediately. More
+	 *      efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	while (*new) {
+		struct bfq_weight_counter *__counter = container_of(*new,
+						struct bfq_weight_counter,
+						weights_node);
+		parent = *new;
+
+		if (entity->weight == __counter->weight) {
+			entity->weight_counter = __counter;
+			goto inc_counter;
+		}
+		if (entity->weight < __counter->weight)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+					 GFP_ATOMIC);
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root)
+{
+	if (!entity->weight_counter)
+		return;
+
+	BUG_ON(RB_EMPTY_ROOT(root));
+	BUG_ON(entity->weight_counter->weight != entity->weight);
+
+	BUG_ON(!entity->weight_counter->num_active);
+	entity->weight_counter->num_active--;
+	if (entity->weight_counter->num_active > 0)
+		goto reset_entity_pointer;
+
+	rb_erase(&entity->weight_counter->weights_node, root);
+	kfree(entity->weight_counter);
+
+reset_entity_pointer:
+	entity->weight_counter = NULL;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct request *next = NULL, *prev = NULL;
+
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
+
+	if (rbprev)
+		prev = rb_entry_rq(rbprev);
+
+	if (rbnext)
+		next = rb_entry_rq(rbnext);
+	else {
+		rbnext = rb_first(&bfqq->sort_list);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
+
+	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+					struct bfq_queue *bfqq)
+{
+	return blk_rq_sectors(rq) *
+		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
+		bfq_async_charge_factor));
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown).  We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+				 struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	struct request *next_rq = bfqq->next_rq;
+	unsigned long new_budget;
+
+	if (!next_rq)
+		return;
+
+	if (bfqq == bfqd->in_service_queue)
+		/*
+		 * In order not to break guarantees, budgets cannot be
+		 * changed after an entity has been selected.
+		 */
+		return;
+
+	BUG_ON(entity->tree != &st->active);
+	BUG_ON(entity == entity->sched_data->in_service_entity);
+
+	new_budget = max_t(unsigned long, bfqq->max_budget,
+			   bfq_serv_to_charge(next_rq, bfqq));
+	if (entity->budget != new_budget) {
+		entity->budget = new_budget;
+		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+					 new_budget);
+		bfq_activate_bfqq(bfqd, bfqq);
+	}
+}
+
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	return dur;
+}
+
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+		struct bfq_queue *pos, *bfqq_item;
+		struct hlist_node *n;
+
+		/*
+		 * Enough queues have been activated shortly after each
+		 * other to consider this burst as large.
+		 */
+		bfqd->large_burst = true;
+
+		/*
+		 * We can now mark all queues in the burst list as
+		 * belonging to a large burst.
+		 */
+		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+				     burst_list_node)
+		        bfq_mark_bfqq_in_large_burst(bfqq_item);
+		bfq_mark_bfqq_in_large_burst(bfqq);
+
+		/*
+		 * From now on, and until the current burst finishes, any
+		 * new queue being activated shortly after the last queue
+		 * was inserted in the burst can be immediately marked as
+		 * belonging to a large burst. So the burst list is not
+		 * needed any more. Remove it.
+		 */
+		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+					  burst_list_node)
+			hlist_del_init(&pos->burst_list_node);
+	} else /* burst not yet large: add bfqq to the burst list */
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues happen to become active shortly after each other, then,
+ * to help the processes associated to these queues get their job done as
+ * soon as possible, it is usually better to not grant either weight-raising
+ * or device idling to these queues. In this comment we describe, firstly,
+ * the reasons why this fact holds, and, secondly, the next function, which
+ * implements the main steps needed to properly mark these queues so that
+ * they can then be treated in a different way.
+ *
+ * As for the terminology, we say that a queue becomes active, i.e.,
+ * switches from idle to backlogged, either when it is created (as a
+ * consequence of the arrival of an I/O request), or, if already existing,
+ * when a new request for the queue arrives while the queue is idle.
+ * Bursts of activations, i.e., activations of different queues occurring
+ * shortly after each other, are typically caused by services or applications
+ * that spawn or reactivate many parallel threads/processes. Examples are
+ * systemd during boot or git grep.
+ *
+ * These services or applications benefit mostly from a high throughput:
+ * the quicker the requests of the activated queues are cumulatively served,
+ * the sooner the target job of these queues gets completed. As a consequence,
+ * weight-raising any of these queues, which also implies idling the device
+ * for it, is almost always counterproductive: in most cases it just lowers
+ * throughput.
+ *
+ * On the other hand, a burst of activations may be also caused by the start
+ * of an application that does not consist in a lot of parallel I/O-bound
+ * threads. In fact, with a complex application, the burst may be just a
+ * consequence of the fact that several processes need to be executed to
+ * start-up the application. To start an application as quickly as possible,
+ * the best thing to do is to privilege the I/O related to the application
+ * with respect to all other I/O. Therefore, the best strategy to start as
+ * quickly as possible an application that causes a burst of activations is
+ * to weight-raise all the queues activated during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the two
+ * types of bursts need to be distinguished. Fortunately, this seems
+ * relatively easy to do, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that bursts with a larger size
+ * than that threshold are apparently caused only by services or commands
+ * such as systemd or git grep. For brevity, hereafter we call just 'large'
+ * these bursts. BFQ *does not* weight-raise queues whose activations occur
+ * in a large burst. In addition, for each of these queues BFQ performs or
+ * does not perform idling depending on which choice boosts the throughput
+ * most. The exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Turning back to the next function, it implements all the steps needed
+ * to detect the occurrence of a large burst and to properly mark all the
+ * queues belonging to it (so that they can then be treated in a different
+ * way). This goal is achieved by maintaining a special "burst list" that
+ * holds, temporarily, the queues that belong to the burst in progress. The
+ * list is then used to mark these queues as belonging to a large burst if
+ * the burst does become large. The main steps are the following.
+ *
+ * . when the very first queue is activated, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (keeping temporarily track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and now is not needed any more
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is activated while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is activated a while
+ *   later, i.e., not shortly after, than the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			     bool idle_for_long_time)
+{
+	/*
+	 * If bfqq happened to be activated in a burst, but has been idle
+	 * for at least as long as an interactive queue, then we assume
+	 * that, in the overall I/O initiated in the burst, the I/O
+	 * associated to bfqq is finished. So bfqq does not need to be
+	 * treated as a queue belonging to a burst anymore. Accordingly,
+	 * we reset bfqq's in_large_burst flag if set, and remove bfqq
+	 * from the burst list if it's there. We do not decrement instead
+	 * burst_size, because the fact that bfqq does not need to belong
+	 * to the burst list any more does not invalidate the fact that
+	 * bfqq may have been activated during the current burst.
+	 */
+	if (idle_for_long_time) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	/*
+	 * If bfqq is already in the burst list or is part of a large
+	 * burst, then there is nothing else to do.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq))
+		return;
+
+	/*
+	 * If bfqq's activation happens late enough, then the current
+	 * burst is finished, and related data structures must be reset.
+	 *
+	 * In this respect, consider the special case where bfqq is the very
+	 * first queue being activated. In this case, last_ins_in_burst is
+	 * not yet significant when we get here. But it is easy to verify
+	 * that, whether or not the following condition is true, bfqq will
+	 * end up being inserted into the burst list. In particular the
+	 * list will happen to contain only bfqq. And this is exactly what
+	 * has to happen, as bfqq may be the first queue in a possible
+	 * burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+	    bfqd->bfq_burst_interval)) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		return;
+	}
+
+	/*
+	 * If we get here, then bfqq is being activated shortly after the
+	 * last queue. So, if the current burst is also large, we can mark
+	 * bfqq as belonging to this large burst immediately.
+	 */
+	if (bfqd->large_burst) {
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		return;
+	}
+
+	/*
+	 * If we get here, then a large-burst state has not yet been
+	 * reached, but bfqq is being activated shortly after the last
+	 * queue. Then we add bfqq to the burst.
+	 */
+	bfq_add_to_burst(bfqd, bfqq);
+}
+
+static void bfq_add_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_data *bfqd = bfqq->bfqd;
+	struct request *next_rq, *prev;
+	unsigned long old_wr_coeff = bfqq->wr_coeff;
+	bool interactive = false;
+
+	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+	bfqq->queued[rq_is_sync(rq)]++;
+	bfqd->queued++;
+
+	elv_rb_add(&bfqq->sort_list, rq);
+
+	/*
+	 * Check if this request is a better next-serve candidate.
+	 */
+	prev = bfqq->next_rq;
+	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+	BUG_ON(!next_rq);
+	bfqq->next_rq = next_rq;
+
+	if (!bfq_bfqq_busy(bfqq)) {
+		bool soft_rt, in_burst,
+		     idle_for_long_time = time_is_before_jiffies(
+						bfqq->budget_timeout +
+						bfqd->bfq_wr_min_idle_time);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+					 rq->cmd_flags);
+#endif
+		if (bfq_bfqq_sync(bfqq)) {
+			bool already_in_burst =
+			   !hlist_unhashed(&bfqq->burst_list_node) ||
+			   bfq_bfqq_in_large_burst(bfqq);
+			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
+			/*
+			 * If bfqq was not already in the current burst,
+			 * then, at this point, bfqq either has been
+			 * added to the current burst or has caused the
+			 * current burst to terminate. In particular, in
+			 * the second case, bfqq has become the first
+			 * queue in a possible new burst.
+			 * In both cases last_ins_in_burst needs to be
+			 * moved forward.
+			 */
+			if (!already_in_burst)
+				bfqd->last_ins_in_burst = jiffies;
+		}
+
+		in_burst = bfq_bfqq_in_large_burst(bfqq);
+		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+			!in_burst &&
+			time_is_before_jiffies(bfqq->soft_rt_next_start);
+		interactive = !in_burst && idle_for_long_time;
+		entity->budget = max_t(unsigned long, bfqq->max_budget,
+				       bfq_serv_to_charge(next_rq, bfqq));
+
+		if (!bfq_bfqq_IO_bound(bfqq)) {
+			if (time_before(jiffies,
+					RQ_BIC(rq)->ttime.last_end_request +
+					bfqd->bfq_slice_idle)) {
+				bfqq->requests_within_timer++;
+				if (bfqq->requests_within_timer >=
+				    bfqd->bfq_requests_within_timer)
+					bfq_mark_bfqq_IO_bound(bfqq);
+			} else
+				bfqq->requests_within_timer = 0;
+		}
+
+		if (!bfqd->low_latency)
+			goto add_bfqq_busy;
+
+		/*
+		 * If the queue:
+		 * - is not being boosted,
+		 * - has been idle for enough time,
+		 * - is not a sync queue or is linked to a bfq_io_cq (it is
+		 *   shared "for its nature" or it is not shared and its
+		 *   requests have not been redirected to a shared queue)
+		 * start a weight-raising period.
+		if (old_wr_coeff == 1 && (interactive || soft_rt) &&
+		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais starting at %lu, rais_max_time %u",
+				     jiffies,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+		} else if (old_wr_coeff > 1) {
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else if (in_burst ||
+				 (bfqq->wr_cur_max_time ==
+				  bfqd->bfq_wr_rt_max_time &&
+				  !soft_rt)) {
+				bfqq->wr_coeff = 1;
+				bfq_log_bfqq(bfqd, bfqq,
+					"wrais ending at %lu, rais_max_time %u",
+					jiffies,
+					jiffies_to_msecs(bfqq->
+						wr_cur_max_time));
+			} else if (time_before(
+					bfqq->last_wr_start_finish +
+					bfqq->wr_cur_max_time,
+					jiffies +
+					bfqd->bfq_wr_rt_max_time) &&
+				   soft_rt) {
+				/*
+				 *
+				 * The remaining weight-raising time is lower
+				 * than bfqd->bfq_wr_rt_max_time, which means
+				 * that the application is enjoying weight
+				 * raising either because deemed soft-rt in
+				 * the near past, or because deemed interactive
+				 * a long ago.
+				 * In both cases, resetting now the current
+				 * remaining weight-raising time for the
+				 * application to the weight-raising duration
+				 * for soft rt applications would not cause any
+				 * latency increase for the application (as the
+				 * new duration would be higher than the
+				 * remaining time).
Loading
Loading full blame...