From 3b527d479c2237f51f1df19e964f81f245517841 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini
Date: Fri, 14 Jun 2013 16:18:30 +0200
Subject: [PATCH] block: Switch from BFQ-v6r1 for 2.6.38 to BFQ-v6r2 for
 2.6.38.

Improvements:
- Fairness fix: the case of queue expiration for budget timeout is now
  correctly handled for sync queues as well, so that the processes
  corresponding to these queues are also guaranteed their reserved
  share of the disk throughput.
- Fixed a bug that prevented group weights from being correctly set via
  the sysfs interface.
- Fixed a bug that cleared a previously-set group weight if the same
  value was re-inserted via the sysfs interface.
- Fixed an EQM bug that allowed a newly-started process to skip its
  initial weight-raising period if its queue was merged before its
  first request was inserted.
- Fixed a bug that preserved already-started weight-raising periods
  even if the low_latency tunable was disabled.
- The raising_max_time tunable now reports the maximum raising time in
  milliseconds, which is more user-friendly.

Signed-off-by: Paolo Valente
Signed-off-by: Arianna Avanzini
---
 block/bfq-cgroup.c  |  47 ++++++++++++---
 block/bfq-ioc.c     |  20 +++++-
 block/bfq-iosched.c | 171 ++++++++++++++++++++++++++++++++++++++--------------
 block/bfq.h         |   4 +-
 4 files changed, 188 insertions(+), 54 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 18099f5..09800da 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -61,11 +61,22 @@ static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
 {
 	struct bfq_entity *entity = &bfqg->entity;
 
-	entity->weight = entity->new_weight = bgrp->weight;
-	entity->orig_weight = entity->new_weight;
-	entity->ioprio = entity->new_ioprio = bgrp->ioprio;
+	/*
+	 * If the weight of the entity has never been set via the sysfs
+	 * interface, then bgrp->weight == 0. In this case we initialize
+	 * the weight from the current ioprio value. Otherwise, the group
+	 * weight, if set, has priority over the ioprio value.
+	 */
+	if (bgrp->weight == 0) {
+		entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
+		entity->new_ioprio = bgrp->ioprio;
+	} else {
+		entity->new_weight = bgrp->weight;
+		entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
+	}
+	entity->orig_weight = entity->weight = entity->new_weight;
+	entity->ioprio = entity->new_ioprio;
 	entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
-	entity->ioprio_changed = 1;
 	entity->my_sched_data = &bfqg->sched_data;
 }
 
@@ -525,6 +536,15 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
 	kfree(bfqg);
 }
 
+static void bfq_end_raising_async(struct bfq_data *bfqd)
+{
+	struct hlist_node *pos, *n;
+	struct bfq_group *bfqg;
+
+	hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node)
+		bfq_end_raising_async_queues(bfqd, bfqg);
+}
+
 /**
  * bfq_disconnect_groups - diconnect @bfqd from all its groups.
  * @bfqd: the device descriptor being exited.
@@ -645,9 +665,17 @@ static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup,		\
 	spin_lock_irq(&bgrp->lock);					\
 	bgrp->__VAR = (unsigned short)val;				\
 	hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) {	\
-		bfqg->entity.new_##__VAR = (unsigned short)val;		\
-		smp_wmb();						\
-		bfqg->entity.ioprio_changed = 1;			\
+		/*							\
+		 * Setting the ioprio_changed flag of the entity	\
+		 * to 1 with new_##__VAR == ##__VAR would re-set	\
+		 * the value of the weight to its ioprio mapping.	\
+		 * Set the flag only if necessary.			\
\ + */ \ + if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ } \ spin_unlock_irq(&bgrp->lock); \ \ @@ -817,6 +845,11 @@ static inline void bfq_bfqq_move(struct bfq_data *bfqd, { } +static void bfq_end_raising_async(struct bfq_data *bfqd) +{ + bfq_end_raising_async_queues(bfqd, bfqd->root_group); +} + static inline void bfq_disconnect_groups(struct bfq_data *bfqd) { bfq_put_async_queues(bfqd, bfqd->root_group); diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c index bfe81c2..7dcf60f 100644 --- a/block/bfq-ioc.c +++ b/block/bfq-ioc.c @@ -181,7 +181,25 @@ static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd, bfqd->queue->node); if (cic != NULL) { cic->last_end_request = jiffies; - cic->raising_time_left = 0; + /* + * A newly created cic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ + cic->raising_time_left = 1; INIT_LIST_HEAD(&cic->queue_list); INIT_HLIST_NODE(&cic->cic_list); cic->dtor = bfq_free_io_context; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6129209..59f0556 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -466,7 +466,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct cfq_io_context *cic) bfq_mark_bfqq_idle_window(bfqq); else bfq_clear_bfqq_idle_window(bfqq); - if (cic->raising_time_left) { + if (cic->raising_time_left && bfqq->bfqd->low_latency) { /* * Start a weight raising period with the duration given by * the raising_time_left snapshot. @@ -475,6 +475,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct cfq_io_context *cic) bfqq->raising_cur_max_time = cic->raising_time_left; bfqq->last_rais_start_finish = jiffies; } + /* + * Clear raising_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. 
+	 */
+	cic->raising_time_left = 0;
 }
 
 /*
@@ -734,6 +740,44 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,
 	bfq_remove_request(next);
 }
 
+/* Must be called with bfqq != NULL */
+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
+{
+	BUG_ON(bfqq == NULL);
+	bfqq->raising_coeff = 1;
+	bfqq->raising_cur_max_time = 0;
+	/* Trigger a weight change on the next activation of the queue */
+	bfqq->entity.ioprio_changed = 1;
+}
+
+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			if (bfqg->async_bfqq[i][j] != NULL)
+				bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
+	if (bfqg->async_idle_bfqq != NULL)
+		bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_raising(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+		bfq_bfqq_end_raising(bfqq);
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+		bfq_bfqq_end_raising(bfqq);
+	bfq_end_raising_async(bfqd);
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+}
+
 static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
 {
 	if (request)
@@ -972,7 +1016,16 @@ bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	 */
 	if (bfqq->cic == NULL)
 		return;
-	if (bfqq->raising_coeff > 1) {
+	if (bfqq->cic->raising_time_left)
+		/*
+		 * This is the queue of a just-started process, and would
+		 * deserve weight raising: we set raising_time_left to the full
+		 * weight-raising duration to trigger weight-raising when and
+		 * if the queue is split and the first request of the queue
+		 * is enqueued.
+		 */
+		bfqq->cic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
+	else if (bfqq->raising_coeff > 1) {
 		unsigned long wrais_duration =
 			jiffies - bfqq->last_rais_start_finish;
 		/*
@@ -994,8 +1047,7 @@ bfq_bfqq_save_state(struct bfq_queue *bfqq)
 		 * both cases it should not be owned by an interactive or soft
 		 * real-time application.
 		 */
-		bfqq->raising_coeff = 1;
-		bfqq->entity.ioprio_changed = 1;
+		bfq_bfqq_end_raising(bfqq);
 	} else
 		bfqq->cic->raising_time_left = 0;
 	bfqq->cic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
@@ -1172,13 +1224,6 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 
 	WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
 
-	if (bfq_queue_nonrot_noidle(bfqd, bfqq))
-		return;
-
-	/* Idling is disabled, either manually or by past process history. */
-	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq))
-		return;
-
 	/* Tasks have exited, don't wait. */
 	cic = bfqd->active_cic;
 	if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0)
@@ -1687,6 +1732,50 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
 }
 
 /*
+ * If the active queue is empty, but it is sync and either of the following
+ * conditions holds, then: 1) the queue must remain active and cannot be
+ * expired, and 2) the disk must be idled to wait for the possible arrival
+ * of a new request for the queue. The conditions are:
+ * - the device is rotational and not performing NCQ, and the queue has its
+ *   idle window set (in this case, waiting for a new request for the queue
+ *   is likely to boost the disk throughput);
+ * - the queue is weight-raised (waiting for the request is necessary for
+ *   providing the queue with fairness and latency guarantees).
+ *
+ * In any case, idling can be disabled for cooperation issues, if
+ * 1) there is a close cooperator for the queue, or
+ * 2) the queue is shared and some cooperator is likely to be idle (in this
+ *    case, by not arming the idle timer, we try to slow down the queue, to
+ *    prevent the zones of the disk accessed by the active cooperators to
+ *    become too distant from the zone that will be accessed by the currently
+ *    idle cooperators).
+ */
+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
+				      int budg_timeout)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	struct bfq_queue *coop_bfqq;
+
+	spin_lock(&bfqd->eqm_lock);
+	coop_bfqq = bfq_close_cooperator(bfqd, bfqq, bfqd->last_position);
+	spin_unlock(&bfqd->eqm_lock);
+
+	return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
+		bfqd->bfq_slice_idle != 0 &&
+		((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
+		  !blk_queue_nonrot(bfqd->queue))
+		 || bfqq->raising_coeff > 1) &&
+		(bfqd->rq_in_driver == 0 ||
+		 budg_timeout ||
+		 bfqq->raising_coeff > 1) &&
+		!coop_bfqq &&
+		(!bfq_bfqq_coop(bfqq) ||
+		 !bfq_bfqq_some_coop_idle(bfqq)) &&
+		!bfq_queue_nonrot_noidle(bfqd, bfqq));
+}
+
+/*
  * Select a queue for service. If we have a current active queue,
  * check whether to continue servicing it, or retrieve and set a new one.
  */
@@ -1695,6 +1784,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	struct bfq_queue *bfqq;
 	struct request *next_rq;
 	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+	int budg_timeout;
 
 	bfqq = bfqd->active_queue;
 	if (bfqq == NULL)
@@ -1702,7 +1792,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 
 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
 
-	if (bfq_may_expire_for_budg_timeout(bfqq))
+	budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
+	if (budg_timeout &&
+	    !bfq_bfqq_must_idle(bfqq, budg_timeout))
 		goto expire;
 
 	next_rq = bfqq->next_rq;
@@ -1746,8 +1838,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	 * then keep it.
 	 */
 	if (timer_pending(&bfqd->idle_slice_timer) ||
-	    (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) &&
-	     !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
+	    (bfqq->dispatched != 0 &&
+	     (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
+	     !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
 		bfqq = NULL;
 		goto keep_queue;
 	}
@@ -1795,10 +1888,8 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 			if (soft_rt)
 				bfqq->raising_cur_max_time =
 					bfqd->bfq_raising_rt_max_time;
-			else {
-				bfqq->raising_coeff = 1;
-				entity->ioprio_changed = 1;
-			}
+			else
+				bfq_bfqq_end_raising(bfqq);
 		}
 	}
 	/* Update weight both if it must be raised and if it must be lowered */
@@ -1808,7 +1899,6 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 			entity);
 }
 
-
 /*
  * Dispatch one request from bfqq, moving it to the request queue
  * dispatch list.
@@ -2461,6 +2551,14 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
 
 	bfq_add_rq_rb(rq);
 
+	/*
+	 * Here a newly-created bfq_queue has already started a weight-raising
+	 * period: clear raising_time_left to prevent bfq_bfqq_save_state()
+	 * from assigning it a full weight-raising period. See the detailed
+	 * comments about this field in bfq_alloc_io_context().
+	 */
+	if (bfqq->cic != NULL)
+		bfqq->cic->raising_time_left = 0;
 	rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &bfqq->fifo);
 
@@ -2494,7 +2592,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
 
 static void bfq_completed_request(struct request_queue *q, struct request *rq)
 {
-	struct bfq_queue *bfqq = RQ_BFQQ(rq), *coop_bfqq;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd = bfqq->bfqd;
 	const int sync = rq_is_sync(rq);
 
@@ -2519,33 +2617,14 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (bfqd->active_queue == bfqq) {
+		int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
 		if (bfq_bfqq_budget_new(bfqq))
 			bfq_set_budget_timeout(bfqd);
 
-		/* Idling is disabled also for cooperation issues:
-		 * 1) there is a close cooperator for the queue, or
-		 * 2) the queue is shared and some cooperator is likely
-		 *    to be idle (in this case, by not arming the idle timer,
-		 *    we try to slow down the queue, to prevent the zones
-		 *    of the disk accessed by the active cooperators to become
-		 *    too distant from the zone that will be accessed by the
-		 *    currently idle cooperators)
-		 */
-		if (bfq_may_expire_for_budg_timeout(bfqq))
+		if (bfq_bfqq_must_idle(bfqq, budg_timeout))
+			bfq_arm_slice_timer(bfqd);
+		else if (budg_timeout)
 			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
-		else if (sync &&
-			 (bfqd->rq_in_driver == 0 ||
-			  bfqq->raising_coeff > 1)
-			 && RB_EMPTY_ROOT(&bfqq->sort_list)
-			 && (!bfq_bfqq_coop(bfqq) ||
-			     !bfq_bfqq_some_coop_idle(bfqq))) {
-			spin_lock(&bfqd->eqm_lock);
-			coop_bfqq = bfq_close_cooperator(bfqd, bfqq,
-							 bfqd->last_position);
-			spin_unlock(&bfqd->eqm_lock);
-			if (!coop_bfqq)
-				bfq_arm_slice_timer(bfqd);
-		}
 	}
 
 	if (!bfqd->rq_in_driver)
@@ -3042,8 +3121,8 @@ static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
 {
 	struct bfq_data *bfqd = e->elevator_data;
 	return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
-		       bfqd->bfq_raising_max_time :
-		       bfq_wrais_duration(bfqd));
+		       jiffies_to_msecs(bfqd->bfq_raising_max_time) :
+		       jiffies_to_msecs(bfq_wrais_duration(bfqd)));
 }
 
 static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
@@ -3220,6 +3299,8 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e,
 
 	if (__data > 1)
 		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_raising(bfqd);
 	bfqd->low_latency = __data;
 
 	return ret;
diff --git a/block/bfq.h b/block/bfq.h
index fc4ec1e..d8335f1 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -1,5 +1,5 @@
 /*
- * BFQ-v6r1 for 2.6.38: data structures and common functions prototypes.
+ * BFQ-v6r2 for 2.6.38: data structures and common functions prototypes.
  *
  * Based on ideas and code from CFQ:
  * Copyright (C) 2003 Jens Axboe
@@ -601,6 +601,8 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				       struct bfq_group *bfqg, int is_sync,
 				       struct io_context *ioc, gfp_t gfp_mask);
+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 #endif
-- 
1.8.1.4
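
Not part of the patch: a minimal userspace sketch for exercising the two
user-visible changes above. After this patch, reading raising_max_time is
expected to return milliseconds instead of jiffies, and writing 0 to
low_latency should also terminate weight-raising periods already in
progress. The sysfs directory below is an assumption: it is the usual
iosched location, and presumes BFQ is the active elevator for sda.

/* bfq_tunables_demo.c - illustrative only; adjust the device name to
 * match your setup. */
#include <stdio.h>

#define IOSCHED_DIR "/sys/block/sda/queue/iosched/"

/* Read a tunable into buf; returns 0 on success, -1 on failure. */
static int read_tunable(const char *name, char *buf, int len)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), IOSCHED_DIR "%s", name);
	f = fopen(path, "r");
	if (f == NULL)
		return -1;
	if (fgets(buf, len, f) == NULL)
		buf[0] = '\0';
	fclose(f);
	return 0;
}

/* Write a value to a tunable; returns 0 on success, -1 on failure. */
static int write_tunable(const char *name, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), IOSCHED_DIR "%s", name);
	f = fopen(path, "w");
	if (f == NULL)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char buf[64];

	/* With this patch the value below is in milliseconds. */
	if (read_tunable("raising_max_time", buf, sizeof(buf)) == 0)
		printf("raising_max_time: %s", buf);

	/*
	 * With this patch, disabling low_latency also ends in-progress
	 * weight-raising periods (see bfq_low_latency_store() above).
	 */
	if (write_tunable("low_latency", "0") != 0)
		perror("low_latency");

	return 0;
}

The same idea applies to the group-weight fixes: re-writing an unchanged
weight through the group interface should now be a no-op instead of
resetting the weight to its ioprio mapping.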