From a0b843052817e18d02f7d4848a80b0f85b7f0644 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini
Date: Fri, 14 Feb 2014 15:40:17 +0100
Subject: [PATCH] block: Switch from BFQ-v7r1 for 3.2.0 to BFQ-v7r2 for 3.2.0

1) BUGFIX/IMPROVEMENT. One of the requirements for an application to be
deemed as soft real-time is that it issues its requests in batches, and
stops doing I/O for a well-defined amount of time before issuing a new
batch. Imposing this minimum idle time allows BFQ to filter out I/O-bound
applications that may otherwise be incorrectly deemed as soft real-time
(under the circumstances described in detail in the comments to the
function bfq_bfqq_softrt_next_start()). Unfortunately, BFQ could however
start counting this idle time from two different events: either from the
expiration of the queue, if all requests of the queue had also been
already completed when the queue expired, or, if the previous condition
did not hold, from the first completion of one of the still outstanding
requests. In the second case, an application had more chances to be
deemed as soft real-time. Actually, there was no reason for this
differentiated treatment. We addressed this issue by defining more
precisely the above requirement for an application to be deemed as soft
real-time, and changing the code consequently: a well-defined amount of
time must elapse between the completion of *all the requests* of the
current pending batch and the issuing of the first request of the next
batch (this is, in the end, what happens with a true soft real-time
application). This change further reduced false positives, and, as such,
improved responsiveness and reduced latency for actual soft real-time
applications.

2) CODE IMPROVEMENT. We cleaned up the code a little bit and addressed
some issues pointed out by the checkpatch.pl script.

Signed-off-by: Paolo Valente
Signed-off-by: Arianna Avanzini
---
 block/Kconfig.iosched |   2 +-
 block/bfq-cgroup.c    |  29 ++++++++---
 block/bfq-iosched.c   | 142 +++++++++++++++++++++++++-------------------------
 block/bfq.h           |  26 +++++----
 4 files changed, 112 insertions(+), 87 deletions(-)

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 337c1f9..2b6d805 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -53,7 +53,7 @@ config IOSCHED_BFQ
          It aims at distributing the bandwidth as desired, independently of
          the disk parameters and with any workload. It also tries to
          guarantee low latency to interactive and soft real-time
-         applications. If compiled built-in (saying Y here), BFQ can
+         applications. If compiled built-in (saying Y here), BFQ can
          be configured to support hierarchical scheduling.

 config CGROUP_BFQIO
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 8de0181..5ad20fd 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -156,7 +156,7 @@ cleanup:
 }

 /**
- * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy.
+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
  * @bfqd: the queue descriptor.
  * @cgroup: the leaf cgroup to start from.
  * @leaf: the leaf group (to be associated to @cgroup).
@@ -216,7 +216,7 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
  * to the root have a group associated to @bfqd.
  *
  * If the allocation fails, return the root group: this breaks guarantees
- * but is a safe fallbak. If this loss becames a problem it can be
+ * but is a safe fallback. If this loss becomes a problem it can be
  * mitigated using the equivalent weight (given by the product of the
  * weights of the groups in the path from @group to the root) in the
  * root scheduler.
@@ -488,7 +488,7 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
                /*
                 * The idle tree may still contain bfq_queues belonging
                 * to exited task because they never migrated to a different
-                * cgroup from the one being destroyed now. Noone else
+                * cgroup from the one being destroyed now. No one else
                 * can access them so it's safe to act without any lock.
                 */
                bfq_flush_idle_tree(st);
@@ -532,7 +532,7 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
        /*
         * No need to defer the kfree() to the end of the RCU grace
         * period: we are called from the destroy() callback of our
-        * cgroup, so we can be sure that noone is a) still using
+        * cgroup, so we can be sure that no one is a) still using
         * this cgroup or b) doing lookups in it.
         */
        kfree(bfqg);
@@ -549,7 +549,7 @@ static void bfq_end_raising_async(struct bfq_data *bfqd)
 }

 /**
- * bfq_disconnect_groups - diconnect @bfqd from all its groups.
+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
  * @bfqd: the device descriptor being exited.
  *
  * When the device exits we just make sure that no lookup can return
@@ -672,10 +672,25 @@ static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup,		\
                 * Setting the ioprio_changed flag of the entity        \
                 * to 1 with new_##__VAR == ##__VAR would re-set        \
                 * the value of the weight to its ioprio mapping.       \
-                * Set the flag only if necessary.                      \
-                */                                                     \
+                * Set the flag only if necessary.                      \
+                */                                                     \
                if ((unsigned short)val != bfqg->entity.new_##__VAR) {  \
                        bfqg->entity.new_##__VAR = (unsigned short)val; \
+                       /*                                              \
+                        * Make sure that the above new value has been  \
+                        * stored in bfqg->entity.new_##__VAR before    \
+                        * setting the ioprio_changed flag. In fact,    \
+                        * this flag may be read asynchronously (in     \
+                        * critical sections protected by a different   \
+                        * lock than that held here), and finding this  \
+                        * flag set may cause the execution of the code \
+                        * for updating parameters whose value may      \
+                        * depend also on bfqg->entity.new_##__VAR (in  \
+                        * __bfq_entity_update_weight_prio).            \
+                        * This barrier makes sure that the new value   \
+                        * of bfqg->entity.new_##__VAR is correctly     \
+                        * seen in that code.                           \
+                        */                                             \
                        smp_wmb();                                      \
                        bfqg->entity.ioprio_changed = 1;                \
                }                                                       \
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index feb03fa..44e09a0 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1,5 +1,5 @@
 /*
- * BFQ, or Budget Fair Queueing, disk scheduler.
+ * Budget Fair Queueing (BFQ) disk scheduler.
  *
  * Based on ideas and code from CFQ:
  * Copyright (C) 2003 Jens Axboe
@@ -14,7 +14,7 @@
  * BFQ is a proportional share disk scheduling algorithm based on the
  * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
  * number of sectors, to tasks instead of time slices. The disk is not granted
- * to the in-service task for a given time slice, but until it has exahusted
+ * to the in-service task for a given time slice, but until it has exhausted
  * its assigned budget. This change from the time to the service domain allows
  * BFQ to distribute the disk bandwidth among tasks as desired, without any
  * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
@@ -87,7 +87,7 @@ static const int bfq_max_budget_async_rq = 4;
 /*
  * Async to sync throughput distribution is controlled as follows:
  * when an async request is served, the entity is charged the number
- * of sectors of the request, multipled by the factor below
+ * of sectors of the request, multiplied by the factor below
  */
 static const int bfq_async_charge_factor = 10;

@@ -591,8 +591,8 @@ static void bfq_add_rq_rb(struct request *rq)
                         * The remaining weight-raising time is lower
                         * than bfqd->bfq_raising_rt_max_time, which
                         * means that the application is enjoying
-                        * weight raising either because deemed soft rt
-                        * in the near past, or because deemed
+                        * weight raising either because deemed soft-
+                        * rt in the near past, or because deemed
                         * interactive a long ago. In both cases,
                         * resetting now the current remaining weight-
                         * raising time for the application to the
@@ -1190,9 +1190,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
        if (new_bfqq != NULL) {
                bfq_merge_bfqqs(bfqd, cic, bfqq, new_bfqq);
                /*
-                * If we get here, the bio will be queued in the shared queue,
-                * i.e., new_bfqq, so use new_bfqq to decide whether bio and
-                * rq can be merged.
+                * If we get here, the bio will be queued in the shared
+                * queue, i.e., new_bfqq, so use new_bfqq to decide
+                * whether bio and rq can be merged.
                 */
                bfqq = new_bfqq;
        }
@@ -1272,8 +1272,8 @@ static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
         *   a problem with sync vs async workloads;
         * - the queue is not weight-raised, to preserve guarantees.
         */
-       return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
-               in_service_bfqq->raising_coeff == 1);
+       return blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
+              (in_service_bfqq->raising_coeff == 1);
 }

 static void bfq_arm_slice_timer(struct bfq_data *bfqd)
@@ -1455,7 +1455,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
        case BFQ_BFQQ_TOO_IDLE:
                /*
                 * This is the only case where we may reduce
-                * the budget: if there is no requets of the
+                * the budget: if there is no request of the
                 * process still waiting for completion, then
                 * we assume (tentatively) that the timer has
                 * expired because the batch of requests of
@@ -1471,13 +1471,13 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
                 * requests, then the process may have not yet
                 * issued its next request just because it is
                 * still waiting for the completion of some of
-                * the still oustanding ones. So in this
+                * the still outstanding ones. So in this
                 * subcase we do not reduce its budget, on the
                 * contrary we increase it to possibly boost
                 * the throughput, as discussed in the
                 * comments to the BUDGET_TIMEOUT case.
                 */
-               if (bfqq->dispatched > 0) /* still oustanding reqs */
+               if (bfqq->dispatched > 0) /* still outstanding reqs */
                        budget = min(budget * 2, bfqd->bfq_max_budget);
                else {
                        if (budget > 5 * min_budget)
@@ -1674,42 +1674,44 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,

 /*
  * To be deemed as soft real-time, an application must meet two requirements.
- * The first is that the application must not require an average bandwidth
- * higher than the approximate bandwidth required to playback or record a
- * compressed high-definition video.
+ * First, the application must not require an average bandwidth higher than
+ * the approximate bandwidth required to playback or record a compressed high-
+ * definition video.
  * The next function is invoked on the completion of the last request of a
  * batch, to compute the next-start time instant, soft_rt_next_start, such
  * that, if the next request of the application does not arrive before
  * soft_rt_next_start, then the above requirement on the bandwidth is met.
  *
  * The second requirement is that the request pattern of the application is
- * isochronous, i.e., that, after issuing a request or a batch of requests, the
- * application stops for a while, then issues a new batch, and so on. For this
- * reason the next function is invoked to compute soft_rt_next_start only for
- * applications that meet this requirement, whereas soft_rt_next_start is set
- * to infinity for applications that do not.
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute soft_rt_next_start
+ * only for applications that meet this requirement, whereas soft_rt_next_start
+ * is set to infinity for applications that do not.
  *
  * Unfortunately, even a greedy application may happen to behave in an
- * isochronous way if several processes are competing for the CPUs. In fact,
- * in this scenario the application stops issuing requests while the CPUs are
- * busy serving other processes, then restarts, then stops again for a while,
- * and so on. In addition, if the disk achieves a low enough throughput with
- * the request pattern issued by the application (e.g., because the request
- * pattern is random and/or the device is slow), then the above bandwidth
- * requirement may happen to be met too. To prevent such a greedy application
- * to be deemed as soft real-time, a further rule is used in the computation
- * of soft_rt_next_start: soft_rt_next_start must be higher than the current
- * time plus the maximum time for which the arrival of a request is waited
- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
- * filters out greedy applications, as the latter issue instead their next
+ * isochronous way if the CPU load is high. In fact, the application may stop
+ * issuing requests while the CPUs are busy serving other processes, then
+ * restart, then stop again for a while, and so on. In addition, if the disk
+ * achieves a low enough throughput with the request pattern issued by the
+ * application (e.g., because the request pattern is random and/or the device
+ * is slow), then the application may meet the above bandwidth requirement too.
+ * To prevent such a greedy application to be deemed as soft real-time, a
+ * further rule is used in the computation of soft_rt_next_start:
+ * soft_rt_next_start must be higher than the current time plus the maximum
+ * time for which the arrival of a request is waited for when a sync queue
+ * becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their next
  * request as soon as possible after the last one has been completed (in
- * contrast, when a batch of requests is completed, a soft real-time
- * application spends some time processing data).
+ * contrast, when a batch of requests is completed, a soft real-time application
+ * spends some time processing data).
  *
- * Actually, the last filter may easily generate false positives if: only
- * bfqd->bfq_slice_idle is used as a reference time interval, and one or
- * both the following two cases occur:
- * 1) HZ is so low that the duration of a jiffie is comparable to or higher
+ * Unfortunately, the last filter may easily generate false positives if only
+ * bfqd->bfq_slice_idle is used as a reference time interval and one or both
+ * the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
  *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
  *    HZ=100.
  * 2) jiffies, instead of increasing at a constant rate, may stop increasing
@@ -1730,8 +1732,8 @@ static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
 }

 /*
- * Largest-possible time instant such that, for as long as possible, the
- * current time will be lower than this time instant according to the macro
+ * Return the largest-possible time instant such that, for as long as possible,
+ * the current time will be lower than this time instant according to the macro
  * time_is_before_jiffies().
  */
 static inline unsigned long bfq_infinity_from_now(unsigned long now)
@@ -1805,22 +1807,18 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
        if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
            RB_EMPTY_ROOT(&bfqq->sort_list)) {
                /*
-                * If we get here, then the request pattern is
-                * isochronous (see the comments to the function
-                * bfq_bfqq_softrt_next_start()). However, if the
-                * queue still has in-flight requests, then it is
-                * better to postpone the computation of next_start
-                * to the next request completion. In fact, if we
-                * computed it now, then the application might pass
-                * the greedy-application filter improperly, because
-                * the arrival of its next request may happen to be
-                * higher than (jiffies + bfqq->bfqd->bfq_slice_idle)
-                * not because the application is truly soft real-
-                * time, but just because the application is currently
-                * waiting for the completion of some request before
-                * issuing, as quickly as possible, its next request.
+                * If we get here, and there are no outstanding requests,
+                * then the request pattern is isochronous (see the comments
+                * to the function bfq_bfqq_softrt_next_start()). Hence we can
+                * compute soft_rt_next_start. If, instead, the queue still
+                * has outstanding requests, then we have to wait for the
+                * completion of all the outstanding requests to discover
+                * whether the request pattern is actually isochronous.
                 */
-               if (bfqq->dispatched > 0) {
+               if (bfqq->dispatched == 0)
+                       bfqq->soft_rt_next_start =
+                               bfq_bfqq_softrt_next_start(bfqd, bfqq);
+               else {
                        /*
                         * The application is still waiting for the
                         * completion of one or more requests:
@@ -1838,10 +1836,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
                         */
                        bfqq->soft_rt_next_start =
                                bfq_infinity_from_now(jiffies);
+                       /*
+                        * Schedule an update of soft_rt_next_start to when
+                        * the task may be discovered to be isochronous.
+                        */
                        bfq_mark_bfqq_softrt_update(bfqq);
-               } else
-                       bfqq->soft_rt_next_start =
-                               bfq_bfqq_softrt_next_start(bfqd, bfqq);
+               }
        }

        bfq_log_bfqq(bfqd, bfqq,
@@ -1873,7 +1873,7 @@ static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)

 /*
  * If we expire a queue that is waiting for the arrival of a new
- * request, we may prevent the fictitious timestamp backshifting that
+ * request, we may prevent the fictitious timestamp back-shifting that
  * allows the guarantees of the queue to be preserved (see [1] for
  * this tricky aspect). Hence we return true only if this condition
  * does not hold, or if the queue is slow enough to deserve only to be
@@ -1964,9 +1964,9 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
 {
        struct bfq_data *bfqd = bfqq->bfqd;

-       return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
-               bfq_bfqq_must_not_expire(bfqq) &&
-               !bfq_queue_nonrot_noidle(bfqd, bfqq));
+       return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
+              bfq_bfqq_must_not_expire(bfqq) &&
+              !bfq_queue_nonrot_noidle(bfqd, bfqq);
 }

 /*
@@ -2810,10 +2810,14 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
                RQ_CIC(rq)->ttime.last_end_request = jiffies;

        /*
-        * The computation of softrt_next_start was scheduled for the next
-        * request completion: it is now time to compute it.
+        * If we are waiting to discover whether the request pattern of the
+        * task associated with the queue is actually isochronous, and
+        * both requisites for this condition to hold are satisfied, then
+        * compute soft_rt_next_start (see the comments to the function
+        * bfq_bfqq_softrt_next_start()).
         */
-       if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
+       if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+           RB_EMPTY_ROOT(&bfqq->sort_list))
                bfqq->soft_rt_next_start =
                        bfq_bfqq_softrt_next_start(bfqd, bfqq);

@@ -3108,7 +3112,7 @@ static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
  * Release all the bfqg references to its async queues. If we are
  * deallocating the group these queues may still contain requests, so
  * we reparent them to the root cgroup (i.e., the only one that will
- * exist for sure untill all the requests on a device are gone).
+ * exist for sure until all the requests on a device are gone).
  */
 static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 {
@@ -3565,7 +3569,7 @@ static int __init bfq_init(void)
                return -ENOMEM;

        elv_register(&iosched_bfq);
-       printk(KERN_INFO "BFQ I/O-scheduler version: v7r1");
+       pr_info("BFQ I/O-scheduler version: v7r2");

        return 0;
 }
@@ -3587,5 +3591,3 @@ module_init(bfq_init);
 module_exit(bfq_exit);

 MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
diff --git a/block/bfq.h b/block/bfq.h
index f8dae6c..fecaf06 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -1,5 +1,5 @@
 /*
- * BFQ-v7r1 for 3.2.0: data structures and common functions prototypes.
+ * BFQ-v7r2 for 3.2.0: data structures and common functions prototypes.
  *
  * Based on ideas and code from CFQ:
  * Copyright (C) 2003 Jens Axboe
@@ -186,8 +186,16 @@ struct bfq_group;
  * @seek_mean: mean seek distance
  * @last_request_pos: position of the last request enqueued
  * @pid: pid of the process owning the queue, used for logging purposes.
- * @last_rais_start_time: last (idle -> weight-raised) transition attempt
+ * @last_rais_start_finish: start time of the current weight-raising period if
+ *                          the @bfq-queue is being weight-raised, otherwise
+ *                          finish time of the last weight-raising period
  * @raising_cur_max_time: current max raising time for this queue
+ * @soft_rt_next_start: minimum time instant such that, only if a new request
+ *                      is enqueued after this time instant in an idle
+ *                      @bfq_queue with no outstanding requests, then the
+ *                      task associated with the queue it is deemed as soft
+ *                      real-time (see the comments to the function
+ *                      bfq_bfqq_softrt_next_start())
  * @last_idle_bklogged: time of the last transition of the @bfq_queue from
  *                      idle to backlogged
  * @service_from_backlogged: cumulative service received from the @bfq_queue
@@ -195,11 +203,11 @@ struct bfq_group;
  * @cic: pointer to the cfq_io_context owning the bfq_queue, set to %NULL if the
  *       queue is shared
  *
- * A bfq_queue is a leaf request queue; it can be associated to an io_context
- * or more (if it is an async one). @cgroup holds a reference to the
- * cgroup, to be sure that it does not disappear while a bfqq still
- * references it (mostly to avoid races between request issuing and task
- * migration followed by cgroup distruction).
+ * A bfq_queue is a leaf request queue; it can be associated with an io_context
+ * or more, if it is async or shared between cooperating processes. @cgroup
+ * holds a reference to the cgroup, to be sure that it does not disappear while
+ * a bfqq still references it (mostly to avoid races between request issuing and
+ * task migration followed by cgroup destruction).
  * All the fields are protected by the queue lock of the containing bfqd.
  */
 struct bfq_queue {
@@ -394,9 +402,9 @@ enum bfqq_state_flags {
        BFQ_BFQQ_FLAG_sync,             /* synchronous queue */
        BFQ_BFQQ_FLAG_budget_new,       /* no completion with this budget */
        BFQ_BFQQ_FLAG_coop,             /* bfqq is shared */
-       BFQ_BFQQ_FLAG_split_coop,       /* shared bfqq will be splitted */
+       BFQ_BFQQ_FLAG_split_coop,       /* shared bfqq will be split */
        BFQ_BFQQ_FLAG_just_split,       /* queue has just been split */
-       BFQ_BFQQ_FLAG_softrt_update,    /* needs softrt-next-start update */
+       BFQ_BFQQ_FLAG_softrt_update,    /* may need softrt-next-start update */
 };

 #define BFQ_BFQQ_FNS(name)                                             \
-- 
1.8.5.3
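
For readers who want to see the net effect of change 1) at a glance, here is a
compact userspace sketch of the decision flow described in the changelog and in
the bfq_bfqq_expire()/bfq_completed_request() hunks above. It is not BFQ code:
the struct, field and constant names (queue, dispatched, softrt_update,
slice_idle_ms, min_idle_ms) are simplified stand-ins for the bfq_queue/bfq_data
state touched by the patch, and jiffies arithmetic is reduced to plain
milliseconds.

/*
 * Userspace sketch (not kernel code) of the v7r2 rule: the minimum idle
 * time of a soft real-time application is measured from the completion of
 * *all* the requests of the current batch, never from queue expiration
 * alone.  All names and values here are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

struct queue {
	int dispatched;            /* requests dispatched but not yet completed */
	bool softrt_update;        /* next-start computation deferred */
	long soft_rt_next_start;   /* earliest arrival compatible with soft rt */
};

static const long slice_idle_ms = 8;  /* stand-in for bfqd->bfq_slice_idle */
static const long min_idle_ms = 20;   /* stand-in for the bandwidth-based delay */

static long softrt_next_start(long now_ms)
{
	/*
	 * The next batch must respect both the bandwidth-based delay and the
	 * greedy-application filter (current time plus the idling window).
	 */
	long bw_based = now_ms + min_idle_ms;
	long greedy_filter = now_ms + slice_idle_ms;

	return bw_based > greedy_filter ? bw_based : greedy_filter;
}

/* The queue expires with no queued requests left. */
static void on_expire(struct queue *q, long now_ms)
{
	if (q->dispatched == 0)
		q->soft_rt_next_start = softrt_next_start(now_ms);
	else
		q->softrt_update = true;  /* wait for the last completion */
}

/* A dispatched request completes. */
static void on_completion(struct queue *q, long now_ms)
{
	q->dispatched--;
	if (q->softrt_update && q->dispatched == 0) {
		q->softrt_update = false;
		q->soft_rt_next_start = softrt_next_start(now_ms);
	}
}

int main(void)
{
	struct queue q = { .dispatched = 2 };

	on_expire(&q, 100);       /* two requests still in flight: defer */
	on_completion(&q, 110);
	on_completion(&q, 130);   /* batch fully completed only here */
	printf("soft_rt_next_start = %ld ms\n", q.soft_rt_next_start);
	return 0;
}

With the pre-v7r2 behaviour, the idle-time clock could effectively start at the
first completion after expiration (110 ms in this toy run); here it starts only
once the whole batch has completed (130 ms), which is the stricter requirement
stated in the changelog.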
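
The new comment around smp_wmb() in bfq-cgroup.c documents a classic publish
pattern: store the payload (entity.new_##__VAR, i.e. the new weight or ioprio),
make that store visible, and only then set entity.ioprio_changed, so that any
code that observes the flag also observes the value. A minimal userspace
analogue, with C11 release/acquire ordering standing in for the kernel's
smp_wmb() and the matching read-side ordering, might look as follows; the
struct here is illustrative, not the kernel's bfq_entity.

#include <stdatomic.h>
#include <stdio.h>

struct entity {
	unsigned short new_weight;   /* payload, written before the flag */
	atomic_int ioprio_changed;   /* flag read asynchronously elsewhere */
};

static void set_weight(struct entity *e, unsigned short val)
{
	if (val != e->new_weight) {
		e->new_weight = val;
		/*
		 * Release store: orders the payload store before the flag,
		 * playing the role of smp_wmb() + plain store in the patch.
		 */
		atomic_store_explicit(&e->ioprio_changed, 1,
				      memory_order_release);
	}
}

static void maybe_update(struct entity *e)
{
	/*
	 * Acquire load: if the flag is observed set, the new_weight store
	 * above is guaranteed to be visible as well.
	 */
	if (atomic_load_explicit(&e->ioprio_changed, memory_order_acquire)) {
		printf("updating parameters with new weight %hu\n",
		       e->new_weight);
		atomic_store_explicit(&e->ioprio_changed, 0,
				      memory_order_relaxed);
	}
}

int main(void)
{
	struct entity e = { .new_weight = 100, .ioprio_changed = 0 };

	set_weight(&e, 120);
	maybe_update(&e);
	return 0;
}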