From ee2d7f2440c3aba2b8973a78e3a81028bb957fde Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 20 Jun 2014 16:39:09 +0200 Subject: [PATCH] block: Switch from BFQ-v7r4 for 3.10.0 to BFQ-v7r5 for 3.10.0 . IMPROVEMENT: Improve throughput boosting by idling the device only for processes that, in addition to perform sequential I/O, are I/O-bound (apart from weight-raised queues, for which idling is always performed to guarantee them a low latency). . IMPROVEMENT: Improve throughput boosting by depriving processes that cooperate often of weight-raising. . CODE IMPROVEMENT: Pass of improvement of the readability of both comments and actual code. Signed-off-by: Paolo Valente Tested-by: Takashi Iwai Reported-by: Pavel Machek Signed-off-by: Arianna Avanzini --- block/bfq-cgroup.c | 29 ++- block/bfq-iosched.c | 696 +++++++++++++++++++++++++++++----------------------- block/bfq-sched.c | 35 +-- block/bfq.h | 181 +++++++++----- 4 files changed, 538 insertions(+), 403 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 666123f..43bfb35 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -9,7 +9,8 @@ * * Copyright (C) 2010 Paolo Valente * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. */ #ifdef CONFIG_CGROUP_BFQIO @@ -143,8 +144,9 @@ static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, bfq_group_set_parent(prev, bfqg); /* * Build a list of allocated nodes using the bfqd - * filed, that is still unused and will be initialized - * only after the node will be connected. + * filed, that is still unused and will be + * initialized only after the node will be + * connected. */ prev->bfqd = bfqg; prev = bfqg; @@ -164,7 +166,8 @@ cleanup: } /** - * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy. + * bfq_group_chain_link - link an allocated group chain to a cgroup + * hierarchy. * @bfqd: the queue descriptor. * @cgroup: the leaf cgroup to start from. * @leaf: the leaf group (to be associated to @cgroup). @@ -437,7 +440,8 @@ static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, } /** - * bfq_reparent_active_entities - move to the root group all active entities. + * bfq_reparent_active_entities - move to the root group all active + * entities. * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree with the entities. @@ -482,8 +486,8 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) hlist_del(&bfqg->group_node); /* - * Empty all service_trees belonging to this group before deactivating - * the group itself. + * Empty all service_trees belonging to this group before + * deactivating the group itself. */ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { st = bfqg->sched_data.service_tree + i; @@ -503,7 +507,7 @@ static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) * all the leaf entities corresponding to these queues * to the root_group. * Also, it may happen that the group has an entity - * under service, which is disconnected from the active + * in service, which is disconnected from the active * tree: it must be moved, too. * There is no need to put the sync queues, as the * scheduler has taken no reference. 
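The comment in bfq_group_chain_alloc() above describes threading the newly allocated groups into a temporary chain through the still-unused bfqd field, so that a failed allocation can unwind cleanly. A rough stand-alone sketch of that pattern follows, with illustrative names and plain calloc() instead of the kernel allocators; it is not the patch's code.

#include <stdlib.h>

struct node {
        struct node *parent;    /* set only once the node is connected */
        void *tmp_link;         /* unused until then; doubles as 'next' */
};

static struct node *chain_alloc(int depth)
{
        struct node *leaf = NULL, *prev = NULL, *n;
        int i;

        for (i = 0; i < depth; i++) {
                n = calloc(1, sizeof(*n));
                if (n == NULL)
                        goto cleanup;
                if (leaf == NULL)
                        leaf = n;
                if (prev != NULL)
                        prev->tmp_link = n;     /* temporary chain link */
                prev = n;
        }
        return leaf;

cleanup:
        /* walk the temporary chain from the leaf and free what was built */
        while (leaf != NULL) {
                n = leaf->tmp_link;
                free(leaf);
                leaf = n;
        }
        return NULL;
}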
@@ -772,10 +776,11 @@ static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset) ioc = task->io_context; if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) /* - * ioc == NULL means that the task is either too young or - * exiting: if it has still no ioc the ioc can't be shared, - * if the task is exiting the attach will fail anyway, no - * matter what we return here. + * ioc == NULL means that the task is either too + * young or exiting: if it has still no ioc the + * ioc can't be shared, if the task is exiting the + * attach will fail anyway, no matter what we + * return here. */ ret = -EINVAL; task_unlock(task); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2924f3f..89063f9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -9,28 +9,32 @@ * * Copyright (C) 2010 Paolo Valente * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. * - * BFQ is a proportional share disk scheduling algorithm based on the - * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in - * number of sectors, to tasks instead of time slices. The disk is not granted - * to the in-service task for a given time slice, but until it has exhausted - * its assigned budget. This change from the time to the service domain allows - * BFQ to distribute the disk bandwidth among tasks as desired, without any - * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an - * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to - * their budgets (more precisely BFQ schedules queues associated to tasks). - * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to - * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low - * latencies to interactive and soft real-time applications. + * BFQ is a proportional-share storage-I/O scheduling algorithm based on + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to processes instead of time slices. The + * device is not granted to the in-service process for a given time slice, + * but until it has exhausted its assigned budget. This change from the time + * to the service domain allows BFQ to distribute the device throughput + * among processes as desired, without any distortion due to ZBR, workload + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, + * called B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated to processes. Thanks to the + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to + * I/O-bound processes issuing sequential requests (to boost the + * throughput), and yet guarantee a low latency to interactive and soft + * real-time applications. * * BFQ is described in [1], where also a reference to the initial, more - * theoretical paper on BFQ can be found. The interested reader can find in - * the latter paper full details on the main algorithm as well as formulas of - * the guarantees, plus formal proofs of all the properties. With respect to - * the version of BFQ presented in these papers, this implementation adds a - * few more heuristics, such as the one that guarantees a low latency to soft - * real-time applications, and a hierarchical extension based on H-WF2Q+. + * theoretical paper on BFQ can be found. 
The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. * * B-WF2Q+ is based on WF2Q+, that is described in [2], together with * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) @@ -115,18 +119,19 @@ struct kmem_cache *bfq_pool; #define BFQ_RATE_SHIFT 16 /* - * By default, BFQ computes the duration of the weight raising for interactive - * applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and R and T - * are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), and T - * is a reference time: given the systems that are likely to be installed on - * the reference device according to its speed class, T is about the maximum - * time needed, under BFQ and while reading two files in parallel, to load - * typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it takes - * to load applications with respect to the reference device. Accordingly, the - * longer/shorter BFQ grants weight raising to interactive applications. + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. + * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. * * BFQ uses four different reference pairs (R, T), depending on: * . whether the device is rotational or non-rotational; @@ -148,8 +153,8 @@ static int R_slow[2] = {1536, 10752}; static int R_fast[2] = {17415, 34791}; /* * To improve readability, a conversion function is used to initialize the - * following arrays, which entails that the latter can be initialized only - * in a function. + * following arrays, which entails that they can be initialized only in a + * function. */ static int T_slow[2]; static int T_fast[2]; @@ -391,8 +396,8 @@ static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) * weight-counter tree for the queues may contain at most one node. * This holds even if low_latency is on, because weight-raised queues * are not inserted in the tree. - * In most scenarios, also the rate at which nodes are created/destroyed - * should be low. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. 
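The duration = (R / r) * T rule described in the comment above can be illustrated with a small stand-alone sketch. The real bfq_wr_duration() works on jiffies and a fixed-point peak rate (BFQ_RATE_SHIFT); the helper below uses plain integers and milliseconds, and its names and numbers are illustrative only.

#include <stdio.h>

/*
 * Sketch of "duration = (R / r) * T": R and T are the reference pair for
 * the device class, r is the estimated peak rate of the device at hand.
 */
static unsigned long wr_duration_ms(unsigned long peak_rate,
                                    unsigned long R_ref,
                                    unsigned long T_ref_ms)
{
        if (peak_rate == 0)             /* sketch-only guard: no estimate yet */
                return T_ref_ms;
        return R_ref * T_ref_ms / peak_rate;
}

int main(void)
{
        /* a device twice as fast as the reference gets half the duration */
        printf("%lu ms\n", wr_duration_ms(2000, 1000, 3000));   /* 1500 ms */
        return 0;
}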
*/ static void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, @@ -454,7 +459,7 @@ static void bfq_weights_tree_remove(struct bfq_data *bfqd, { /* * Check whether the entity is actually associated with a counter. - * In fact, the device may be not be considered NCQ-capable for a while, + * In fact, the device may not be considered NCQ-capable for a while, * which implies that no insertion in the weight trees is performed, * after which the device may start to be deemed NCQ-capable, and hence * this function may start to be invoked. This may cause the function @@ -566,6 +571,12 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? bfqq->bic->cooperations : 0; +} + static inline void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) { @@ -573,13 +584,18 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfq_mark_bfqq_idle_window(bfqq); else bfq_clear_bfqq_idle_window(bfqq); - if (bic->wr_time_left && bfqq->bfqd->low_latency) { + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + if (bic->wr_time_left && bfqq->bfqd->low_latency && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { /* * Start a weight raising period with the duration given by * the raising_time_left snapshot. */ if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->raised_busy_queues++; + bfqq->bfqd->wr_busy_queues++; bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bic->wr_time_left; bfqq->last_wr_start_finish = jiffies; @@ -637,13 +653,28 @@ static void bfq_add_request(struct request *rq) if (!bfq_bfqq_busy(bfqq)) { int soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + bfq_bfqq_cooperations(bfqq) < bfqd->bfq_coop_thresh && time_is_before_jiffies(bfqq->soft_rt_next_start); - idle_for_long_time = time_is_before_jiffies( + idle_for_long_time = bfq_bfqq_cooperations(bfqq) < + bfqd->bfq_coop_thresh && + time_is_before_jiffies( bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time); entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(next_rq, bfqq)); + if (!bfq_bfqq_IO_bound(bfqq)) { + if (time_before(jiffies, + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle)) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + if (!bfqd->low_latency) goto add_bfqq_busy; @@ -674,15 +705,17 @@ static void bfq_add_request(struct request *rq) } else if (old_wr_coeff > 1) { if (idle_for_long_time) bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - else if (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt) { + else if (bfq_bfqq_cooperations(bfqq) >= + bfqd->bfq_coop_thresh || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { bfqq->wr_coeff = 1; bfq_log_bfqq(bfqd, bfqq, "wrais ending at %lu, rais_max_time %u", jiffies, jiffies_to_msecs(bfqq-> - wr_cur_max_time)); + wr_cur_max_time)); } else if (time_before( bfqq->last_wr_start_finish + bfqq->wr_cur_max_time, @@ -754,12 +787,12 @@ add_bfqq_busy: bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqd->raised_busy_queues++; + bfqd->wr_busy_queues++; entity->ioprio_changed = 1; bfq_log_bfqq(bfqd, bfqq, - "non-idle wrais starting at %lu, rais_max_time %u", - jiffies, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + "non-idle wrais starting 
at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); } if (prev != bfqq->next_rq) bfq_updated_next_req(bfqd, bfqq); @@ -799,11 +832,12 @@ static void bfq_activate_request(struct request_queue *q, struct request *rq) (long long unsigned)bfqd->last_position); } -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; - WARN_ON(bfqd->rq_in_driver == 0); + BUG_ON(bfqd->rq_in_driver == 0); bfqd->rq_in_driver--; } @@ -837,7 +871,7 @@ static void bfq_remove_request(struct request *rq) } if (rq->cmd_flags & REQ_META) { - WARN_ON(bfqq->meta_pending == 0); + BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } } @@ -879,8 +913,9 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, BUG_ON(next_rq == NULL); bfqq->next_rq = next_rq; /* - * If next_rq changes, update both the queue's budget to fit - * the new request and the queue's position in its rq_pos_tree. + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. */ if (prev != bfqq->next_rq) { bfq_updated_next_req(bfqd, bfqq); @@ -914,7 +949,7 @@ static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { BUG_ON(bfqq == NULL); if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->raised_busy_queues--; + bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; /* Trigger a weight change on the next activation of the queue */ @@ -992,8 +1027,8 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) /* * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector (rq_pos_tree sorted by next_request - * position). + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). */ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) @@ -1103,24 +1138,26 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) new_bfqq->pid); /* - * Merging is just a redirection: the requests of the process owning - * one of the two queues are redirected to the other queue. The latter - * queue, in its turn, is set as shared if this is the first time that - * the requests of some process are redirected to it. + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. * * We redirect bfqq to new_bfqq and not the opposite, because we - * are in the context of the process owning bfqq, hence we have the - * io_cq of this process. So we can immediately configure this io_cq - * to redirect the requests of the process to new_bfqq. + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. * * NOTE, even if new_bfqq coincides with the in-service queue, the - * io_cq of new_bfqq is not available, because, if the in-service queue - * is shared, bfqd->in_service_bic may not point to the io_cq of the - * in-service queue. 
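The I/O-bound detection added to bfq_add_request() above can be summarized by the following stand-alone sketch, which keeps only the counting logic: a request arriving within slice_idle of the previous completion lengthens the run, a late one resets it. Types and the time source are simplified, and the names are illustrative.

#include <stdbool.h>

struct iobound_state {
        bool io_bound;
        unsigned int requests_within_timer;
};

/* called on each request arrival of a not-yet-I/O-bound queue */
static void account_arrival(struct iobound_state *s, unsigned long now,
                            unsigned long last_completion,
                            unsigned long slice_idle, unsigned int threshold)
{
        if (s->io_bound)
                return;
        if (now < last_completion + slice_idle) {
                s->requests_within_timer++;
                if (s->requests_within_timer >= threshold)
                        s->io_bound = true;     /* threshold is 120 in this patch */
        } else {
                s->requests_within_timer = 0;   /* a late arrival resets the run */
        }
}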
- * Redirecting the requests of the process owning bfqq to the currently - * in-service queue is in any case the best option, as we feed the - * in-service queue with new requests close to the last request served - * and, by doing so, hopefully increase the throughput. + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. */ bfqq->new_bfqq = new_bfqq; atomic_add(process_refs, &new_bfqq->ref); @@ -1128,8 +1165,8 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) } /* - * Attempt to schedule a merge of bfqq with the currently in-service queue or - * with a close queue among the scheduled queues. + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue * structure otherwise. */ @@ -1164,7 +1201,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); if (new_bfqq != NULL) - return new_bfqq; /* Merge with the in-service queue */ + return new_bfqq; /* Merge with in-service queue */ } /* @@ -1195,9 +1232,9 @@ bfq_bfqq_save_state(struct bfq_queue *bfqq) /* * This is the queue of a just-started process, and would * deserve weight raising: we set wr_time_left to the full - * weight-raising duration to trigger weight-raising when and - * if the queue is split and the first request of the queue - * is enqueued. + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. */ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); else if (bfqq->wr_coeff > 1) { @@ -1219,13 +1256,16 @@ bfq_bfqq_save_state(struct bfq_queue *bfqq) * The bfq_queue is becoming shared or the requests of the * process owning the queue are being redirected to a shared * queue. Stop the weight raising period of the queue, as in - * both cases it should not be owned by an interactive or soft - * real-time application. + * both cases it should not be owned by an interactive or + * soft real-time application. */ bfq_bfqq_end_wr(bfqq); } else bfqq->bic->wr_time_left = 0; bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; } static inline void @@ -1248,23 +1288,28 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, /* Save weight raising and idle window of the merged queues */ bfq_bfqq_save_state(bfqq); bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); /* * Grab a reference to the bic, to prevent it from being destroyed * before being possibly touched by a bfq_split_bfqq(). 
*/ bfq_get_bic_reference(bfqq); bfq_get_bic_reference(new_bfqq); - /* Merge queues (that is, let bic redirect its requests to new_bfqq) */ + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ bic_set_bfqq(bic, new_bfqq, 1); bfq_mark_bfqq_coop(new_bfqq); /* - * new_bfqq now belongs to at least two bics (it is a shared queue): set - * new_bfqq->bic to NULL. bfqq either: + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: * - does not belong to any bic any more, and hence bfqq->bic must * be set to NULL, or * - is a queue whose owning bics have already been redirected to a - * different queue, hence the queue is destined to not belong to any - * bic soon and bfqq->bic is already NULL (therefore the next + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next * assignment causes no harm). */ new_bfqq->bic = NULL; @@ -1272,6 +1317,18 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + static int bfq_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -1304,12 +1361,13 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, if (new_bfqq != NULL) { bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); /* - * If we get here, the bio will be queued in the shared - * queue, i.e., new_bfqq, so use new_bfqq to decide - * whether bio and rq can be merged. + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. */ bfqq = new_bfqq; - } + } else + bfq_bfqq_increase_failed_cooperations(bfqq); } return bfqq == RQ_BFQQ(rq); @@ -1375,9 +1433,9 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) struct bfq_io_cq *bic; unsigned long sl; - WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* Tasks have exited, don't wait. */ + /* Processes have exited, don't wait. */ bic = bfqd->in_service_bic; if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) return; @@ -1452,9 +1510,9 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) * We execute instead this instruction before bfq_remove_request() * (and hence introduce a temporary inconsistency), for efficiency. * In fact, in a forced_dispatch, this prevents two counters related - * to bfqq->dispatched to risk to be uselessly decremented if bfqq is - * not in service, and then to be incremented again after incrementing - * bfqq->dispatched. + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. 
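The cooperation bookkeeping used throughout this patch (bic->cooperations, bic->failed_cooperations, bfq_coop_thresh) can be sketched in isolation as follows; the struct is a simplified stand-in for bfq_io_cq and the helpers are illustrative, not the patch's functions.

#include <stdbool.h>

struct coop_state {
        unsigned int cooperations;
        unsigned int failed_cooperations;
};

/* the queue has just been merged with another one */
static void on_merge(struct coop_state *s)
{
        s->cooperations++;
        s->failed_cooperations = 0;
}

/* a merge was attempted but did not happen */
static void on_failed_merge(struct coop_state *s, unsigned int coop_thresh,
                            unsigned int failed_thresh)
{
        if (s->cooperations < coop_thresh)
                return;
        if (++s->failed_cooperations >= failed_thresh)
                s->cooperations = 0;            /* forget the old cooperations */
}

/* frequent cooperators are deprived of weight-raising */
static bool may_be_weight_raised(const struct coop_state *s,
                                 unsigned int coop_thresh)
{
        return s->cooperations < coop_thresh;   /* coop_thresh is 2 in this patch */
}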
*/ bfqq->dispatched++; bfq_remove_request(rq); @@ -1510,9 +1568,9 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (RB_EMPTY_ROOT(&bfqq->sort_list)) { /* - * overloading budget_timeout field to store when - * the queue remains with no backlog, used by - * the weight-raising mechanism + * Overloading budget_timeout field to store the time + * at which the queue remains with no backlog; used by + * the weight-raising mechanism. */ bfqq->budget_timeout = jiffies; bfq_del_bfqq_busy(bfqd, bfqq, 1); @@ -1795,10 +1853,10 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, } /* - * To be deemed as soft real-time, an application must meet two requirements. - * First, the application must not require an average bandwidth higher than - * the approximate bandwidth required to playback or record a compressed high- - * definition video. + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. * The next function is invoked on the completion of the last request of a * batch, to compute the next-start time instant, soft_rt_next_start, such * that, if the next request of the application does not arrive before @@ -1809,30 +1867,31 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * the application stops issuing new requests until all its pending requests * have been completed. After that, the application may issue a new batch, * and so on. - * For this reason the next function is invoked to compute soft_rt_next_start - * only for applications that meet this requirement, whereas soft_rt_next_start - * is set to infinity for applications that do not. + * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. * * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may stop - * issuing requests while the CPUs are busy serving other processes, then - * restart, then stop again for a while, and so on. In addition, if the disk - * achieves a low enough throughput with the request pattern issued by the - * application (e.g., because the request pattern is random and/or the device - * is slow), then the application may meet the above bandwidth requirement too. - * To prevent such a greedy application to be deemed as soft real-time, a - * further rule is used in the computation of soft_rt_next_start: - * soft_rt_next_start must be higher than the current time plus the maximum - * time for which the arrival of a request is waited for when a sync queue - * becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their next - * request as soon as possible after the last one has been completed (in - * contrast, when a batch of requests is completed, a soft real-time application - * spends some time processing data). + * isochronous way if the CPU load is high. In fact, the application may + * stop issuing requests while the CPUs are busy serving other processes, + * then restart, then stop again for a while, and so on. 
In addition, if + * the disk achieves a low enough throughput with the request pattern + * issued by the application (e.g., because the request pattern is random + * and/or the device is slow), then the application may meet the above + * bandwidth requirement too. To prevent such a greedy application to be + * deemed as soft real-time, a further rule is used in the computation of + * soft_rt_next_start: soft_rt_next_start must be higher than the current + * time plus the maximum time for which the arrival of a request is waited + * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. + * This filters out greedy applications, as the latter issue instead their + * next request as soon as possible after the last one has been completed + * (in contrast, when a batch of requests is completed, a soft real-time + * application spends some time processing data). * - * Unfortunately, the last filter may easily generate false positives if only - * bfqd->bfq_slice_idle is used as a reference time interval and one or both - * the following cases occur: + * Unfortunately, the last filter may easily generate false positives if + * only bfqd->bfq_slice_idle is used as a reference time interval and one + * or both the following cases occur: * 1) HZ is so low that the duration of a jiffy is comparable to or higher * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with * HZ=100. @@ -1841,8 +1900,9 @@ static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, * increments. This seems to happen, e.g., inside virtual machines. * To address this issue, we do not use as a reference time interval just * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter seems - * to be quite precise also in embedded systems and KVM/QEMU virtual machines. + * particular we add the minimum number of jiffies for which the filter + * seems to be quite precise also in embedded systems and KVM/QEMU virtual + * machines. */ static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -1910,7 +1970,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * As above explained, 'punish' slow (i.e., seeky), timed-out * and async queues, to favor sequential sync workloads. * - * Processes doing IO in the slower disk zones will tend to be + * Processes doing I/O in the slower disk zones will tend to be * slow(er) even if not seeky. Hence, since the estimated peak * rate is actually an average over the disk surface, these * processes may timeout just for bad luck. To avoid punishing @@ -1930,6 +1990,10 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, bfqd->const_seeky_busy_in_flight_queues++; } + if (reason == BFQ_BFQQ_TOO_IDLE && + bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) + bfq_clear_bfqq_IO_bound(bfqq); + if (bfqd->low_latency && bfqq->wr_coeff == 1) bfqq->last_wr_start_finish = jiffies; @@ -1938,11 +2002,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, /* * If we get here, and there are no outstanding requests, * then the request pattern is isochronous (see the comments - * to the function bfq_bfqq_softrt_next_start()). Hence we can - * compute soft_rt_next_start. If, instead, the queue still - * has outstanding requests, then we have to wait for the - * completion of all the outstanding requests to discover - * whether the request pattern is actually isochronous. + * to the function bfq_bfqq_softrt_next_start()). 
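The body of bfq_bfqq_softrt_next_start() is not part of this hunk, so the sketch below only restates the two constraints described in the comment above as the maximum of two lower bounds; the units are simplified and the names are illustrative, not the patch's code.

/* units are assumed consistent: 'service' units and 'max_rate' units per tick */
static unsigned long softrt_next_start_sketch(unsigned long now,
                                              unsigned long service,
                                              unsigned long max_rate,
                                              unsigned long slice_idle,
                                              unsigned long extra_ticks)
{
        /* soft real-time detection is enabled only with a non-zero max_rate */
        unsigned long bw_bound = now + service / max_rate;
        unsigned long idle_bound = now + slice_idle + extra_ticks;

        return bw_bound > idle_bound ? bw_bound : idle_bound;
}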
Hence we + * can compute soft_rt_next_start. If, instead, the queue + * still has outstanding requests, then we have to wait + * for the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. */ if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = @@ -1974,10 +2039,13 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, - bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); - /* Increase, decrease or leave budget unchanged according to reason */ + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); __bfq_bfqq_expire(bfqd, bfqq); } @@ -2018,58 +2086,63 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) } /* - * Device idling is allowed only for the queues for which this function returns - * true. For this reason, the return value of this function plays a critical - * role for both throughput boosting and service guarantees. This return value - * is computed through a logical expression. In this rather long comment, we - * try to briefly describe all the details and motivations behind the - * components of this logical expression. + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. * - * First, the expression may be true only for sync queues. Besides, if bfqq is - * also being weight-raised, then the expression always evaluates to true, as - * device idling is instrumental for preserving low-latency guarantees - * (see [1]). Otherwise, the expression evaluates to true only if bfqq has - * a non-null idle window and either the device is not performing NCQ - * (because, when both of the last two conditions hold, idling most certainly - * boosts the throughput), or the following compound condition is true. + * First, the expression may be true only for sync queues. Besides, if + * bfqq is also being weight-raised, then the expression always evaluates + * to true, as device idling is instrumental for preserving low-latency + * guarantees (see [1]). Otherwise, the expression evaluates to true only + * if bfqq has a non-null idle window and at least one of the following + * two conditions holds. The first condition is that the device is not + * performing NCQ, because idling the device most certainly boosts the + * throughput if this condition holds and bfqq has been granted a non-null + * idle window. The second compound condition is made of the logical AND of + * two components. * - * The compound condition contains a first component that lets the whole - * compound condition evaluate to false if there is at least one - * weight-raised busy queue. This guarantees that, in this case, the device - * is not idled for a sync non-weight-raised queue. The latter is then expired - * immediately if empty. 
Combined with the timestamping rules of BFQ (see [1] - * for details), this causes sync non-weight-raised queues to get a lower - * number of requests served, and hence to ask for a lower number of requests - * from the request pool, before the busy weight-raised queues get served - * again. + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. * - * This is beneficial for the processes associated with weight-raised queues, - * when the system operates in request-pool saturation conditions (e.g., in - * the presence of write hogs). In fact, if the processes associated with the - * other queues ask for requests at a lower rate, then weight-raised processes - * have a higher probability to get a request from the pool immediately (or at + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at * least soon) when they need one. Hence they have a higher probability to - * actually get a fraction of the disk throughput proportional to their high - * weight. This is especially true with NCQ-enabled drives, which enqueue - * several requests in advance and further reorder internally-queued requests. + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. * - * In the end, mistreating non-weight-raised queues when there are busy weight- - * raised queues seems to mitigate starvation problems in the presence of heavy - * write workloads and NCQ, and hence to guarantee a higher application and - * system responsiveness in these hostile scenarios. + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. * - * If the first component of the compound condition is instead true (i.e., - * there is no weight-raised busy queue), then the rest of the compound - * condition takes into account service-guarantee and throughput issues. + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). 
* * As for service guarantees, allowing the drive to enqueue more than one - * request at a time, and hence delegating de facto final scheduling decisions - * to the drive's internal scheduler, causes loss of control on the actual - * request service order. In this respect, when the drive is allowed to - * enqueue more than one request at a time, the service distribution enforced - * by the drive's internal scheduler is likely to coincide with the desired - * device-throughput distribution only in the following, perfectly symmetric, - * scenario: + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: * 1) all active queues have the same weight, * 2) all active groups at the same level in the groups tree have the same * weight, @@ -2078,59 +2151,61 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) * * Even in such a scenario, sequential I/O may still receive a preferential * treatment, but this is not likely to be a big issue with flash-based - * devices, because of their non-dramatic loss of throughput with random I/O. - * Things do differ with HDDs, for which additional care is taken, as + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as * explained after completing the discussion for flash-based devices. * - * Unfortunately, keeping the necessary state for evaluating exactly the above - * symmetry conditions would be quite complex and time consuming. Therefore BFQ - * evaluates instead the following stronger sub-conditions, for which it is - * much easier to maintain the needed state: + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore BFQ evaluates instead the following stronger sub-conditions, + * for which it is much easier to maintain the needed state: * 1) all active queues have the same weight, * 2) all active groups have the same weight, * 3) all active groups have at most one active child each. * In particular, the last two conditions are always true if hierarchical * support and the cgroups interface are not enabled, hence no state needs - * to be maintained. + * to be maintained in this case. * - * According to the above considerations, the compound condition evaluates - * to true and hence idling is performed if any of the above symmetry - * sub-condition does not hold. These are the only sub-conditions considered - * if the device is flash-based, as, for such a device, it is sensible to - * force idling only for service-guarantee issues. In fact, as for throughput, - * idling NCQ-capable flash-based devices would not boost the throughput even - * with sequential I/O; rather it would lower the throughput in proportion to - * how fast the device is. In the end, (only) if all the three sub-conditions - * hold and the device is flash-based, then the compound condition evaluates - * to false and hence no idling is performed. 
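The three symmetry sub-conditions listed above can be spelled out naively as follows. BFQ tracks them incrementally (for instance, "all weights are equal" reduces to "the weights tree holds at most one node"); the loops below are only an illustration with made-up types.

#include <stdbool.h>
#include <stddef.h>

struct active_group {
        unsigned int weight;
        unsigned int active_children;
};

static bool symmetric_scenario_sketch(const unsigned int *queue_weights,
                                      size_t nr_queues,
                                      const struct active_group *groups,
                                      size_t nr_groups)
{
        size_t i;

        for (i = 1; i < nr_queues; i++)
                if (queue_weights[i] != queue_weights[0])
                        return false;   /* 1) queues differ in weight */
        for (i = 0; i < nr_groups; i++) {
                if (groups[i].weight != groups[0].weight)
                        return false;   /* 2) groups differ in weight */
                if (groups[i].active_children > 1)
                        return false;   /* 3) a group has >1 active child */
        }
        return true;
}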
+ * According to the above considerations, the second component of the + * compound condition evaluates to true if any of the above symmetry + * sub-condition does not hold, or the device is not flash-based. Therefore, + * if also the first component is true, then idling is allowed for a sync + * queue. These are the only sub-conditions considered if the device is + * flash-based, as, for such a device, it is sensible to force idling only + * for service-guarantee issues. In fact, as for throughput, idling + * NCQ-capable flash-based devices would not boost the throughput even + * with sequential I/O; rather it would lower the throughput in proportion + * to how fast the device is. In the end, (only) if all the three + * sub-conditions hold and the device is flash-based, the compound + * condition evaluates to false and therefore no idling is performed. * - * As already said, things change with a rotational device, where idling boosts - * the throughput with sequential I/O (even with NCQ). Hence, for such a device - * the compound condition evaluates to true and idling is performed also if the - * following additional sub-condition does not hold: the queue is (constantly) - * seeky. Unfortunately, this different behavior with respect to flash-based - * devices causes an additional asymmetry: if some sync queues enjoy idling and - * some other sync queues do not, then the latter get a low share of the device - * bandwidth, simply because the former get many requests served after being - * set as in service, whereas the latter do not. As a consequence, to - * guarantee the desired bandwidth distribution, on HDDs the compound - * expression evaluates to true (and hence device idling is performed) also - * if the following last symmetry condition does not hold: no other queue is - * benefiting from idling. - * Also this last condition is actually replaced with a simpler-to-maintain - * and stronger condition: there is no busy queue which is not seeky (and - * hence may also benefit from idling). + * As already said, things change with a rotational device, where idling + * boosts the throughput with sequential I/O (even with NCQ). Hence, for + * such a device the second component of the compound condition evaluates + * to true also if the following additional sub-condition does not hold: + * the queue is constantly seeky. Unfortunately, this different behavior + * with respect to flash-based devices causes an additional asymmetry: if + * some sync queues enjoy idling and some other sync queues do not, then + * the latter get a low share of the device throughput, simply because the + * former get many requests served after being set as in service, whereas + * the latter do not. As a consequence, to guarantee the desired throughput + * distribution, on HDDs the compound expression evaluates to true (and + * hence device idling is performed) also if the following last symmetry + * condition does not hold: no other queue is benefiting from idling. Also + * this last condition is actually replaced with a simpler-to-maintain and + * stronger condition: there is no busy queue which is not constantly seeky + * (and hence may also benefit from idling). * * To sum up, when all the required symmetry and throughput-boosting - * sub-conditions hold, the compound condition evaluates to false, and hence - * no idling is performed. This helps to keep the drives' internal queues full - * on NCQ-capable devices, and hence to boost the throughput, without causing - * 'almost' any loss of service guarantees. 
The 'almost' follows from the fact - * that, if the internal queue of one such device is filled while all the - * sub-conditions hold, but at some point in time some sub-condition stops to - * hold, then it may become impossible to let requests be served in the new - * desired order until all the requests already queued in the device have been - * served. + * sub-conditions hold, the second component of the compound condition + * evaluates to false, and hence no idling is performed. This helps to + * keep the drives' internal queues full on NCQ-capable devices, and hence + * to boost the throughput, without causing 'almost' any loss of service + * guarantees. The 'almost' follows from the fact that, if the internal + * queue of one such device is filled while all the sub-conditions hold, + * but at some point in time some sub-condition stops to hold, then it may + * become impossible to let requests be served in the new desired order + * until all the requests already queued in the device have been served. */ static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) { @@ -2149,29 +2224,29 @@ static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) * the device). */ #define cond_for_expiring_non_wr (bfqd->hw_tag && \ - (bfqd->raised_busy_queues > 0 || \ + (bfqd->wr_busy_queues > 0 || \ (symmetric_scenario && \ (blk_queue_nonrot(bfqd->queue) || \ cond_for_seeky_on_ncq_hdd)))) - return bfq_bfqq_sync(bfqq) && ( - bfqq->wr_coeff > 1 || - (bfq_bfqq_idle_window(bfqq) && - !cond_for_expiring_non_wr) + return bfq_bfqq_sync(bfqq) && + (bfq_bfqq_IO_bound(bfqq) || bfqq->wr_coeff > 1) && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) ); } /* - * If the in-service queue is empty, but it is sync and either of the following - * conditions holds, then: 1) the queue must remain in service and cannot be - * expired, and 2) the disk must be idled to wait for the possible arrival - * of a new request for the queue. The conditions are: - * - the device is rotational and not performing NCQ, and the queue has its - * idle window set (in this case, waiting for a new request for the queue - * is likely to boost the disk throughput); - * - the queue is weight-raised (waiting for the request is necessary to - * provide the queue with fairness and latency guarantees, see [1] for - * details). + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the disk must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments to the function bfq_bfqq_must_not_expire for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_must_not_expire itself + * returns true. */ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { @@ -2214,16 +2289,18 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) goto expire; } else { /* - * The idle timer may be pending because we may not - * disable disk idling even when a new request arrives + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. 
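The shape of the idling decision, as visible in the return expression of bfq_bfqq_must_not_expire() above, can be condensed into the following boolean sketch. Each argument stands for a condition BFQ tracks elsewhere, and seeky_cond_on_ncq_hdd is a placeholder for the HDD-specific sub-condition whose definition lies outside this hunk.

#include <stdbool.h>

static bool may_idle_for_queue(bool sync, bool io_bound, bool weight_raised,
                               bool idle_window, bool hw_tag,
                               bool some_wr_queue_busy, bool symmetric,
                               bool nonrot_device, bool seeky_cond_on_ncq_hdd)
{
        bool expire_non_wr;

        if (!sync)
                return false;           /* only sync queues may idle */
        if (!io_bound && !weight_raised)
                return false;           /* new in v7r5: require I/O-boundness */
        if (weight_raised)
                return true;            /* idling preserves low latency */

        /* non-weight-raised sync queue */
        expire_non_wr = hw_tag &&
                (some_wr_queue_busy ||
                 (symmetric && (nonrot_device || seeky_cond_on_ncq_hdd)));

        return idle_window && !expire_non_wr;
}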
*/ if (timer_pending(&bfqd->idle_slice_timer)) { /* * If we get here: 1) at least a new request * has arrived but we have not disabled the * timer because the request was too small, - * 2) then the block layer has unplugged the - * device, causing the dispatch to be invoked. + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. * * Since the device is unplugged, now the * requests are probably large enough to @@ -2238,9 +2315,9 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) } /* - * No requests pending. If the in-service queue still has requests in - * flight (possibly waiting for a completion) or is idling for a new - * request, then keep it. + * No requests pending. If the in-service queue still has requests + * in flight (possibly waiting for a completion) or is idling for a + * new request, then keep it. */ if (timer_pending(&bfqd->idle_slice_timer) || (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { @@ -2259,36 +2336,36 @@ keep_queue: return bfqq; } -static void bfq_update_raising_data(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old raising coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - - bfqq->last_wr_start_finish), - jiffies_to_msecs(bfqq->wr_cur_max_time), - bfqq->wr_coeff, - bfqq->entity.weight, bfqq->entity.orig_weight); + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != - entity->orig_weight * bfqq->wr_coeff); + entity->orig_weight * bfqq->wr_coeff); if (entity->ioprio_changed) - bfq_log_bfqq(bfqd, bfqq, - "WARN: pending prio change"); + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + /* * If too much time has elapsed from the beginning - * of this weight-raising period, stop it. + * of this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * stop it. */ - if (time_is_before_jiffies(bfqq->last_wr_start_finish + + if (bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time)) { bfqq->last_wr_start_finish = jiffies; bfq_log_bfqq(bfqd, bfqq, "wrais ending at %lu, rais_max_time %u", bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq-> - wr_cur_max_time)); + jiffies_to_msecs(bfqq->wr_cur_max_time)); bfq_bfqq_end_wr(bfqq); } } @@ -2320,20 +2397,18 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { /* - * This may happen if the next rq is chosen - * in fifo order instead of sector order. - * The budget is properly dimensioned - * to be always sufficient to serve the next request - * only if it is chosen in sector order. The reason is - * that it would be quite inefficient and little useful - * to always make sure that the budget is large enough - * to serve even the possible next rq in fifo order. + * This may happen if the next rq is chosen in fifo order + * instead of sector order. The budget is properly + * dimensioned to be always sufficient to serve the next + * request only if it is chosen in sector order. 
The reason + * is that it would be quite inefficient and little useful + * to always make sure that the budget is large enough to + * serve even the possible next rq in fifo order. * In fact, requests are seldom served in fifo order. * - * Expire the queue for budget exhaustion, and - * make sure that the next act_budget is enough - * to serve the next request, even if it comes - * from the fifo expired path. + * Expire the queue for budget exhaustion, and make sure + * that the next act_budget is enough to serve the next + * request, even if it comes from the fifo expired path. */ bfqq->next_rq = rq; /* @@ -2349,7 +2424,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfq_bfqq_served(bfqq, service_to_charge); bfq_dispatch_insert(bfqd->queue, rq); - bfq_update_raising_data(bfqd, bfqq); + bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), budg left %lu", @@ -2390,8 +2465,8 @@ static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) } /* - * Drain our current requests. Used for barriers and when switching - * io schedulers on-the-fly. + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. */ static int bfq_forced_dispatch(struct bfq_data *bfqd) { @@ -2507,10 +2582,8 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq) */ __bfqq = bfqq->new_bfqq; while (__bfqq) { - if (__bfqq == bfqq) { - WARN(1, "bfqq->new_bfqq loop detected.\n"); + if (__bfqq == bfqq) break; - } next = __bfqq->new_bfqq; bfq_put_queue(__bfqq); __bfqq = next; @@ -2532,7 +2605,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_init_icq(struct io_cq *icq) +static inline void bfq_init_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); @@ -2622,11 +2695,6 @@ static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->entity.ioprio_changed = 1; - /* - * Keep track of original prio settings in case we have to temporarily - * elevate the priority of this queue. - */ - bfqq->org_ioprio = bfqq->entity.new_ioprio; bfq_clear_bfqq_prio_changed(bfqq); } @@ -2641,8 +2709,8 @@ static void bfq_changed_ioprio(struct bfq_io_cq *bic) bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); /* - * This condition may trigger on a newly created bic, be sure to drop - * the lock before returning. + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. */ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) goto out; @@ -2688,6 +2756,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_mark_bfqq_idle_window(bfqq); bfq_mark_bfqq_sync(bfqq); } + bfq_mark_bfqq_IO_bound(bfqq); /* Tentative initial value to trade off between thr and lat */ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; @@ -2793,7 +2862,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); /* - * Pin the queue now that it's allocated, scheduler exit will prune it. + * Pin the queue now that it's allocated, scheduler exit will + * prune it. 
*/ if (!is_sync && *async_bfqq == NULL) { atomic_inc(&bfqq->ref); @@ -3006,7 +3076,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) bfqq, new_bfqq); rq->elv.priv[1] = new_bfqq; bfqq = new_bfqq; - } + } else + bfq_bfqq_increase_failed_cooperations(bfqq); } bfq_init_prio_data(bfqq, RQ_BIC(rq)); @@ -3063,8 +3134,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) bfq_update_hw_tag(bfqd); - WARN_ON(!bfqd->rq_in_driver); - WARN_ON(!bfqq->dispatched); + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; @@ -3075,8 +3146,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) BUG_ON(!bfqd->busy_in_flight_queues); bfqd->busy_in_flight_queues--; if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON( - !bfqd->const_seeky_busy_in_flight_queues); + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); bfqd->const_seeky_busy_in_flight_queues--; } } @@ -3145,9 +3216,9 @@ static int bfq_may_queue(struct request_queue *q, int rw) /* * Don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be queued. - * So just lookup a possibly existing queue, or return 'may queue' - * if that fails. + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. */ bic = bfq_bic_lookup(bfqd, tsk->io_context); if (bic == NULL) @@ -3273,9 +3344,9 @@ new_queue: if (split) { bfq_mark_bfqq_just_split(bfqq); /* - * If the queue has just been split from a shared queue, - * restore the idle window and the possible weight - * raising period. + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. */ bfq_bfqq_resume_state(bfqq, bic); } @@ -3318,12 +3389,12 @@ static void bfq_idle_slice_timer(unsigned long data) bfqq = bfqd->in_service_queue; /* - * Theoretical race here: the in-service queue can be NULL or different - * from the queue that was idling if the timer handler spins on - * the queue_lock and a new request arrives for the current - * queue and there is a full dispatch cycle that changes the - * in-service queue. This can hardly happen, but in the worst case - * we just expire a queue too early. + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. */ if (bfqq != NULL) { bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); @@ -3337,9 +3408,9 @@ static void bfq_idle_slice_timer(unsigned long data) else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) /* * The queue may not be empty upon timer expiration, - * because we may not disable the timer when the first - * request of the in-service queue arrives during - * disk idling + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. 
*/ reason = BFQ_BFQQ_TOO_IDLE; else @@ -3479,6 +3550,10 @@ static int bfq_init_queue(struct request_queue *q) bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + bfqd->low_latency = true; bfqd->bfq_wr_coeff = 20; @@ -3492,13 +3567,13 @@ static int bfq_init_queue(struct request_queue *q) * high-definition compressed * video. */ - bfqd->raised_busy_queues = 0; + bfqd->wr_busy_queues = 0; bfqd->busy_in_flight_queues = 0; bfqd->const_seeky_busy_in_flight_queues = 0; /* - * Begin by assuming, optimistically, that the device peak rate is equal - * to the highest reference rate. + * Begin by assuming, optimistically, that the device peak rate is + * equal to the highest reference rate. */ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * T_fast[blk_queue_nonrot(bfqd->queue)]; @@ -3527,7 +3602,8 @@ static ssize_t bfq_var_show(unsigned int var, char *page) return sprintf(page, "%d\n", var); } -static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) { unsigned long new_val; int ret = kstrtoul(page, 10, &new_val); @@ -3565,8 +3641,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) bfqq->entity.weight, bfqq->queued[0], bfqq->queued[1], - jiffies_to_msecs(jiffies - - bfqq->last_wr_start_finish), + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), jiffies_to_msecs(bfqq->wr_cur_max_time)); } @@ -3602,7 +3677,8 @@ SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, + bfqd->bfq_max_budget_async_rq, 0); SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); @@ -3806,14 +3882,14 @@ static int __init bfq_init(void) T_fast[1] = msecs_to_jiffies(2000); /* - * Thresholds that determine the switch between speed classes (see the - * comments before the definition of the array). + * Thresholds that determine the switch between speed classes (see + * the comments before the definition of the array). */ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; elv_register(&iosched_bfq); - pr_info("BFQ I/O-scheduler version: v7r4"); + pr_info("BFQ I/O-scheduler version: v7r5"); return 0; } diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 0fd077c..546a254 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -97,7 +97,8 @@ static inline void bfq_update_budget(struct bfq_entity *next_in_service) * Shift for timestamp calculations. This actually limits the maximum * service allowed in one timestamp delta (small shift values increase it), * the maximum total weight that can be used for the queues in the system - * (big shift values increase it), and the period of virtual time wraparounds. + * (big shift values increase it), and the period of virtual time + * wraparounds. 
*/ #define WFQ_SERVICE_SHIFT 22 @@ -339,7 +340,8 @@ static void bfq_weights_tree_remove(struct bfq_data *bfqd, /** - * bfq_active_insert - insert an entity in the active tree of its group/device. + * bfq_active_insert - insert an entity in the active tree of its + * group/device. * @st: the service tree of the entity. * @entity: the entity being inserted. * @@ -395,9 +397,9 @@ static void bfq_active_insert(struct bfq_service_tree *st, * bfq_ioprio_to_weight - calc a weight from an ioprio. * @ioprio: the ioprio value to convert. */ -static unsigned short bfq_ioprio_to_weight(int ioprio) +static inline unsigned short bfq_ioprio_to_weight(int ioprio) { - WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); return IOPRIO_BE_NR - ioprio; } @@ -409,9 +411,9 @@ static unsigned short bfq_ioprio_to_weight(int ioprio) * 0 is used as an escape ioprio value for weights (numerically) equal or * larger than IOPRIO_BE_NR */ -static unsigned short bfq_weight_to_ioprio(int weight) +static inline unsigned short bfq_weight_to_ioprio(int weight) { - WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; } @@ -681,7 +683,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, } /** - * bfq_bfqq_served - update the scheduler status after selection for service. + * bfq_bfqq_served - update the scheduler status after selection for + * service. * @bfqq: the queue being served. * @served: bytes to transfer. * @@ -820,7 +823,7 @@ static void bfq_activate_entity(struct bfq_entity *entity) * and if the caller did not specify @requeue, put it on the idle tree. * * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was under service or if it was the next_in_service for + * if the entity was in service or if it was the next_in_service for * its sched_data; return %0 otherwise. */ static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) @@ -876,7 +879,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) /* * The parent entity is still backlogged, and * we don't need to update it as it is still - * under service. + * in service. */ break; @@ -917,7 +920,7 @@ update: * active tree of the device is not empty. * * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated tasks getting timestamps after a + * we may end up with reactivated processes getting timestamps after a * vtime skip done because we needed a ->first_active entity on some * intermediate node. */ @@ -940,8 +943,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * * This function searches the first schedulable entity, starting from the * root of the tree and going on the left every time on this side there is - * a subtree with at least one eligible (start >= vtime) entity. The path - * on the right is followed only if a) the left subtree contains no eligible + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible * entities and b) no eligible entity has been found yet. 
*/ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) @@ -1134,14 +1137,14 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, BUG_ON(!bfqd->busy_in_flight_queues); bfqd->busy_in_flight_queues--; if (bfq_bfqq_constantly_seeky(bfqq)) { - BUG_ON( - !bfqd->const_seeky_busy_in_flight_queues); + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); bfqd->const_seeky_busy_in_flight_queues--; } } } if (bfqq->wr_coeff > 1) - bfqd->raised_busy_queues--; + bfqd->wr_busy_queues--; bfq_deactivate_bfqq(bfqd, bfqq, requeue); } @@ -1172,5 +1175,5 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) } } if (bfqq->wr_coeff > 1) - bfqd->raised_busy_queues++; + bfqd->wr_busy_queues++; } diff --git a/block/bfq.h b/block/bfq.h index d9a6ab4..03c3f0c 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v7r4 for 3.10.0: data structures and common functions prototypes. + * BFQ-v7r5 for 3.10.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe @@ -57,7 +57,7 @@ struct bfq_service_tree { /** * struct bfq_sched_data - multi-class scheduler. - * @in_service_entity: entity under service. + * @in_service_entity: entity in service. * @next_in_service: head-of-the-line entity in the scheduler. * @service_tree: array of service trees, one per ioprio_class. * @@ -97,7 +97,7 @@ struct bfq_weight_counter { /** * struct bfq_entity - schedulable entity. * @rb_node: service_tree member. - * @weights_counter: pointer to the weight counter associated with this entity. + * @weight_counter: pointer to the weight counter associated with this entity. * @on_st: flag, true if the entity is on a tree (either the active or * the idle one of its service_tree). * @finish: B-WF2Q+ finish timestamp (aka F_i). @@ -194,36 +194,42 @@ struct bfq_group; * @max_budget: maximum budget allowed from the feedback mechanism. * @budget_timeout: budget expiration (in jiffies). * @dispatched: number of requests on the dispatch list or inside driver. - * @org_ioprio: saved ioprio during boosted periods. * @flags: status flags. * @bfqq_list: node for active/idle bfqq list inside our bfqd. * @seek_samples: number of seeks sampled * @seek_total: sum of the distances of the seeks sampled * @seek_mean: mean seek distance * @last_request_pos: position of the last request enqueued + * @requests_within_timer: number of consecutive pairs of request completion + * and arrival, such that the queue becomes idle + * after the completion, but the next request arrives + * within an idle time slice; used only if the queue's + * IO_bound has been cleared. * @pid: pid of the process owning the queue, used for logging purposes. 
* @last_wr_start_finish: start time of the current weight-raising period if * the @bfq-queue is being weight-raised, otherwise * finish time of the last weight-raising period * @wr_cur_max_time: current max raising time for this queue - * @soft_rt_next_start: minimum time instant such that, only if a new request - * is enqueued after this time instant in an idle - * @bfq_queue with no outstanding requests, then the - * task associated with the queue it is deemed as soft - * real-time (see the comments to the function + * @soft_rt_next_start: minimum time instant such that, only if a new + * request is enqueued after this time instant in an + * idle @bfq_queue with no outstanding requests, then + * the task associated with the queue it is deemed as + * soft real-time (see the comments to the function * bfq_bfqq_softrt_next_start()) * @last_idle_bklogged: time of the last transition of the @bfq_queue from * idle to backlogged * @service_from_backlogged: cumulative service received from the @bfq_queue - * since the last transition from idle to backlogged + * since the last transition from idle to + * backlogged * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the * queue is shared * - * A bfq_queue is a leaf request queue; it can be associated with an io_context - * or more, if it is async or shared between cooperating processes. @cgroup - * holds a reference to the cgroup, to be sure that it does not disappear while - * a bfqq still references it (mostly to avoid races between request issuing and - * task migration followed by cgroup destruction). + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). * All the fields are protected by the queue lock of the containing bfqd. */ struct bfq_queue { @@ -249,8 +255,6 @@ struct bfq_queue { int dispatched; - unsigned short org_ioprio; - unsigned int flags; struct list_head bfqq_list; @@ -260,6 +264,8 @@ struct bfq_queue { sector_t seek_mean; sector_t last_request_pos; + unsigned int requests_within_timer; + pid_t pid; struct bfq_io_cq *bic; @@ -298,7 +304,14 @@ struct bfq_ttime { * shared queue, and is used to set @raising_cur_max_time * when the queue is split from the shared queue and its * weight is raised again - * @saved_idle_window: same purpose as the previous field for the idle window + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues */ struct bfq_io_cq { struct io_cq icq; /* must be the first member */ @@ -308,6 +321,10 @@ struct bfq_io_cq { unsigned int wr_time_left; unsigned int saved_idle_window; + unsigned int saved_IO_bound; + + unsigned int cooperations; + unsigned int failed_cooperations; }; enum bfq_device_speed { @@ -319,41 +336,42 @@ enum bfq_device_speed { * struct bfq_data - per device data structure. * @queue: request queue for the managed device. * @root_group: root bfq_group for the device. 
+ * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). * @active_numerous_groups: number of bfq_groups containing more than one * active @bfq_entity. - * @rq_pos_tree: rbtree sorted by next_request position, - * used when determining if two or more queues - * have interleaving requests (see bfq_close_cooperator). * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by * weight. Used to keep track of whether all @bfq_queues - * have the same weight. The tree contains one counter - * for each distinct weight associated to some active - * and not weight-raised @bfq_queue (see the comments to + * have the same weight. The tree contains one counter + * for each distinct weight associated to some active + * and not weight-raised @bfq_queue (see the comments to * the functions bfq_weights_tree_[add|remove] for - * further details). + * further details). * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted * by weight. Used to keep track of whether all - * @bfq_groups have the same weight. The tree contains - * one counter for each distinct weight associated to - * some active @bfq_group (see the comments to the - * functions bfq_weights_tree_[add|remove] for further - * details). + * @bfq_groups have the same weight. The tree contains + * one counter for each distinct weight associated to + * some active @bfq_group (see the comments to the + * functions bfq_weights_tree_[add|remove] for further + * details). * @busy_queues: number of bfq_queues containing requests (including the - * queue under service, even if it is idling). + * queue in service, even if it is idling). * @busy_in_flight_queues: number of @bfq_queues containing pending or - * in-flight requests, plus the @bfq_queue in service, - * even if idle but waiting for the possible arrival - * of its next sync request. This field is updated only - * if the device is rotational, but used only if the - * device is also NCQ-capable. The reason why the field - * is updated also for non-NCQ-capable rotational - * devices is related to the fact that the value of - * hw_tag may be set also later than when this field may - * need to be incremented for the first time(s). - * Taking also this possibility into account, to avoid - * unbalanced increments/decrements, would imply more - * overhead than just updating this field regardless of - * the value of hw_tag. + * in-flight requests, plus the @bfq_queue in + * service, even if idle but waiting for the + * possible arrival of its next sync request. This + * field is updated only if the device is rotational, + * but used only if the device is also NCQ-capable. + * The reason why the field is updated also for non- + * NCQ-capable rotational devices is related to the + * fact that the value of @hw_tag may be set also + * later than when busy_in_flight_queues may need to + * be incremented for the first time(s). Taking also + * this possibility into account, to avoid unbalanced + * increments/decrements, would imply more overhead + * than just updating busy_in_flight_queues + * regardless of the value of @hw_tag. * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues * (that is, seeky queues that expired * for budget timeout at least once) @@ -364,26 +382,27 @@ enum bfq_device_speed { * is rotational, but used only if the * device is also NCQ-capable (see the * comments to @busy_in_flight_queues). 
- * @raised_busy_queues: number of weight-raised busy bfq_queues. + * @wr_busy_queues: number of weight-raised busy @bfq_queues. * @queued: number of queued requests. * @rq_in_driver: number of requests dispatched and waiting for completion. * @sync_flight: number of sync requests in the driver. - * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples - * completed requests. + * @max_rq_in_driver: max number of reqs in driver in the last + * @hw_tag_samples completed requests. * @hw_tag_samples: nr of samples used to calculate hw_tag. * @hw_tag: flag set to one if the driver is showing a queueing behavior. * @budgets_assigned: number of budgets assigned. * @idle_slice_timer: timer set when idling for the next sequential request - * from the queue under service. + * from the queue in service. * @unplug_work: delayed work to restart dispatching on the request queue. - * @in_service_queue: bfq_queue under service. + * @in_service_queue: bfq_queue in service. * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. * @last_position: on-disk position of the last served request. * @last_budget_start: beginning of the last budget. * @last_idling_start: beginning of the last idle slice. * @peak_rate: peak transfer rate observed for a budget. * @peak_rate_samples: number of samples used to calculate @peak_rate. - * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before + * rescheduling. * @group_list: list of all the bfq_groups active on the device. * @active_list: list of all the bfq_queues active on the device. * @idle_list: list of all the bfq_queues idle on the device. @@ -393,7 +412,8 @@ enum bfq_device_speed { * @bfq_back_penalty: weight of backward seeks wrt forward ones. * @bfq_back_max: maximum allowed backward seek. * @bfq_slice_idle: maximum idling time. - * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). + * @bfq_user_max_budget: user-configured max budget value + * (0 for auto-tuning). * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to * async queues. * @bfq_timeout: timeout for bfq_queues to consume their budget; used to @@ -403,6 +423,17 @@ enum bfq_device_speed { * they are charged for the whole allocated budget, to try * to preserve a behavior reasonably fair among them, but * without service-domain guarantees). + * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is + * no more granted any weight-raising. + * @bfq_failed_cooperations: number of consecutive failed cooperation + * chances after which weight-raising is restored + * to a queue subject to more than bfq_coop_thresh + * queue merges. + * @bfq_requests_within_timer: number of consecutive requests that must be + * issued within the idle time slice to set + * again idling to a queue which was marked as + * non-I/O-bound (see the definition of the + * IO_bound flag for further details). 
* @bfq_wr_coeff: Maximum factor by which the weight of a weight-raised * queue is multiplied * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies) @@ -417,7 +448,7 @@ enum bfq_device_speed { * sectors per seconds * @RT_prod: cached value of the product R*T used for computing the maximum * duration of the weight raising automatically - * @device_speed: device speed class for the low-latency heuristic + * @device_speed: device-speed class for the low-latency heuristic * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions * * All the fields are protected by the @queue lock. @@ -426,18 +457,19 @@ struct bfq_data { struct request_queue *queue; struct bfq_group *root_group; + struct rb_root rq_pos_tree; + #ifdef CONFIG_CGROUP_BFQIO int active_numerous_groups; #endif - struct rb_root rq_pos_tree; struct rb_root queue_weights_tree; struct rb_root group_weights_tree; int busy_queues; int busy_in_flight_queues; int const_seeky_busy_in_flight_queues; - int raised_busy_queues; + int wr_busy_queues; int queued; int rq_in_driver; int sync_flight; @@ -477,6 +509,10 @@ struct bfq_data { unsigned int bfq_max_budget_async_rq; unsigned int bfq_timeout[2]; + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; + unsigned int bfq_requests_within_timer; + bool low_latency; /* parameters of the low_latency heuristics */ @@ -493,7 +529,7 @@ struct bfq_data { }; enum bfqq_state_flags { - BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ @@ -501,14 +537,22 @@ enum bfqq_state_flags { BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ BFQ_BFQQ_FLAG_constantly_seeky, /* - * bfqq has proved to be slow and seeky - * until budget timeout + * bfqq has proved to be slow and + * seeky until budget timeout + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update */ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ - BFQ_BFQQ_FLAG_softrt_update, /* may need softrt-next-start update */ }; #define BFQ_BFQQ_FNS(name) \ @@ -533,6 +577,7 @@ BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(prio_changed); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(constantly_seeky); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); @@ -549,7 +594,10 @@ BFQ_BFQQ_FNS(softrt_update); /* Expiration reasons. */ enum bfqq_expiration { - BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ @@ -571,11 +619,13 @@ enum bfqq_expiration { * except for the idle class that has only one queue. * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). * @my_entity: pointer to @entity, %NULL for the toplevel group; used - * to avoid too many special cases during group creation/migration. 
- * @active_entities: number of active entities belonging to the group; unused - * for the root group. Used to know whether there are groups - * with more than one active @bfq_entity (see the comments - * to the function bfq_bfqq_must_not_expire()). + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_must_not_expire()). * * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup * there is a set of bfq_groups, each one collecting the lower-level @@ -714,4 +764,5 @@ static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -#endif + +#endif /* _BFQ_H */ -- 1.9.3
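
A few illustrative sketches of the mechanisms touched by this patch follow. All of them are hedged user-space models, not kernel code; every identifier that is not named in the diff above is made up for illustration.

The hunk in bfq_put_cooperator() walks the new_bfqq merge chain, dropping one reference per queue, and now simply breaks out if the chain loops back to the starting queue (the WARN is removed). A self-contained sketch of the same traversal pattern, with hypothetical queue and refcount types:

	#include <stdio.h>
	#include <stdlib.h>

	/* Hypothetical stand-in for a bfq_queue: only what the walk needs. */
	struct queue {
		int id;
		int ref;                /* reference count */
		struct queue *new_bfqq; /* next queue in the merge chain, or NULL */
	};

	static void put_queue(struct queue *q)
	{
		/* Model of bfq_put_queue(): drop one reference. */
		if (--q->ref == 0)
			printf("queue %d: last reference dropped\n", q->id);
	}

	/* Model of the chain walk in bfq_put_cooperator(). */
	static void put_cooperator(struct queue *bfqq)
	{
		struct queue *__bfqq = bfqq->new_bfqq;

		while (__bfqq) {
			struct queue *next;

			if (__bfqq == bfqq)	/* chain loops back: stop quietly */
				break;
			next = __bfqq->new_bfqq;
			put_queue(__bfqq);
			__bfqq = next;
		}
	}

	int main(void)
	{
		struct queue a = { .id = 1, .ref = 2 };
		struct queue b = { .id = 2, .ref = 1 };

		a.new_bfqq = &b;
		b.new_bfqq = &a;	/* deliberately create a loop */
		put_cooperator(&a);	/* drops b's reference, then stops at a */
		return 0;
	}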
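
The new per-process counters in struct bfq_io_cq (cooperations, failed_cooperations), the bfq_bfqq_increase_failed_cooperations() call in bfq_insert_request(), and the bfq_coop_thresh/bfq_failed_cooperations tunables set in bfq_init_queue() implement the "deprive frequent cooperators of weight-raising" improvement from the changelog. The sketch below models only the bookkeeping; struct io_cq_state, merge_succeeded(), merge_failed() and deny_weight_raising() are illustrative names, the reset on success assumes the counters track consecutive events, and the exact comparisons and the places where they are checked differ in the scheduler:

	#include <stdbool.h>
	#include <stdio.h>

	/* Default tunable values from bfq_init_queue() in this patch. */
	#define BFQ_COOP_THRESH			2
	#define BFQ_FAILED_COOPERATIONS		7000

	/* Hypothetical stand-in for the two counters added to struct bfq_io_cq. */
	struct io_cq_state {
		unsigned int cooperations;        /* consecutive successful merges */
		unsigned int failed_cooperations; /* consecutive failed merge chances */
	};

	/*
	 * True if the process' queues should currently be denied weight-raising
	 * because they cooperate (merge) too often, and the failed-cooperation
	 * streak is not yet long enough to restore it.
	 */
	static bool deny_weight_raising(const struct io_cq_state *s)
	{
		return s->cooperations >= BFQ_COOP_THRESH &&
		       s->failed_cooperations < BFQ_FAILED_COOPERATIONS;
	}

	static void merge_succeeded(struct io_cq_state *s)
	{
		s->cooperations++;
		s->failed_cooperations = 0;	/* assumed: streak of failures broken */
	}

	static void merge_failed(struct io_cq_state *s)
	{
		/* Counterpart of bfq_bfqq_increase_failed_cooperations(). */
		s->failed_cooperations++;
	}

	int main(void)
	{
		struct io_cq_state s = { 0, 0 };

		merge_succeeded(&s);
		merge_succeeded(&s);
		printf("deny wr after 2 merges: %d\n", deny_weight_raising(&s));
		for (unsigned int i = 0; i < BFQ_FAILED_COOPERATIONS; i++)
			merge_failed(&s);
		printf("deny wr after %u failed chances: %d\n",
		       BFQ_FAILED_COOPERATIONS, deny_weight_raising(&s));
		return 0;
	}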
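
bfq_init_queue() seeds the peak-rate estimate optimistically with RT_prod = R_fast * T_fast for the device's rotational/non-rotational class, and bfq_init() places the speed-class switch threshold halfway between the slow and fast reference rates. A small numeric illustration of those two formulas; the reference-rate and duration values below are made up, the real R_*/T_* arrays are defined elsewhere in bfq-iosched.c:

	#include <stdio.h>

	int main(void)
	{
		/*
		 * Illustrative reference rates (sectors per unit time) and
		 * weight-raising reference durations; NOT the kernel's values.
		 * Index 0 = rotational device, index 1 = non-rotational.
		 */
		unsigned long R_slow[2] = { 1000, 10000 };
		unsigned long R_fast[2] = { 7000, 70000 };
		unsigned long T_fast[2] = { 500, 200 };	/* e.g. jiffies */
		unsigned long device_speed_thresh[2], RT_prod;
		int nonrot = 1;		/* pretend the queue is non-rotational */

		/* Threshold between the "slow" and "fast" speed classes. */
		for (int i = 0; i < 2; i++)
			device_speed_thresh[i] = (R_fast[i] + R_slow[i]) / 2;

		/* Optimistic initial value: assume the device runs at R_fast. */
		RT_prod = R_fast[nonrot] * T_fast[nonrot];

		printf("thresh[rot]=%lu thresh[nonrot]=%lu RT_prod=%lu\n",
		       device_speed_thresh[0], device_speed_thresh[1], RT_prod);
		return 0;
	}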
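
The ioprio/weight helpers touched in bfq-sched.c implement a simple linear mapping: weight = IOPRIO_BE_NR - ioprio, and back again, with ioprio 0 used as an escape value for weights numerically equal to or larger than IOPRIO_BE_NR. A minimal user-space model of that mapping; IOPRIO_BE_NR is 8 in the kernel headers, and the asserts stand in for the BUG_ON checks added by this patch:

	#include <assert.h>
	#include <stdio.h>

	#define IOPRIO_BE_NR 8	/* number of best-effort ioprio levels */

	/* Model of bfq_ioprio_to_weight(): lower ioprio value -> higher weight. */
	static unsigned short ioprio_to_weight(int ioprio)
	{
		assert(ioprio >= 0 && ioprio < IOPRIO_BE_NR);
		return IOPRIO_BE_NR - ioprio;
	}

	/*
	 * Model of bfq_weight_to_ioprio(): 0 is the escape ioprio for weights
	 * (numerically) equal to or larger than IOPRIO_BE_NR.
	 */
	static unsigned short weight_to_ioprio(int weight)
	{
		return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
	}

	int main(void)
	{
		for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++) {
			unsigned int w = ioprio_to_weight(ioprio);

			printf("ioprio %d -> weight %u -> ioprio %u\n",
			       ioprio, w, (unsigned int)weight_to_ioprio(w));
		}
		return 0;
	}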
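
The raised_busy_queues -> wr_busy_queues rename keeps a count of busy queues that are currently weight-raised (wr_coeff > 1), updated in lockstep with the busy state in bfq_add_bfqq_busy()/bfq_del_bfqq_busy(). A tiny model of just that accounting; struct dev_state and struct queue are illustrative, and only the counter updates are mirrored:

	#include <stdio.h>

	struct dev_state {
		int busy_queues;
		int wr_busy_queues;	/* busy queues with wr_coeff > 1 */
	};

	struct queue {
		unsigned int wr_coeff;	/* 1 = not weight-raised */
	};

	static void add_bfqq_busy(struct dev_state *d, struct queue *q)
	{
		d->busy_queues++;
		if (q->wr_coeff > 1)
			d->wr_busy_queues++;
	}

	static void del_bfqq_busy(struct dev_state *d, struct queue *q)
	{
		d->busy_queues--;
		if (q->wr_coeff > 1)
			d->wr_busy_queues--;
	}

	int main(void)
	{
		struct dev_state d = { 0, 0 };
		struct queue plain = { .wr_coeff = 1 };
		struct queue raised = { .wr_coeff = 20 };	/* bfq_wr_coeff default */

		add_bfqq_busy(&d, &plain);
		add_bfqq_busy(&d, &raised);
		printf("busy=%d wr_busy=%d\n", d.busy_queues, d.wr_busy_queues);
		del_bfqq_busy(&d, &raised);
		printf("busy=%d wr_busy=%d\n", d.busy_queues, d.wr_busy_queues);
		return 0;
	}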
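
The new IO_bound queue flag and the per-queue requests_within_timer counter drive the "idle only for I/O-bound processes" improvement from the changelog: once a queue has lost its IO_bound status, it must produce bfq_requests_within_timer (120 by default) consecutive completion/arrival pairs in which the next request arrives within the idle slice before device idling is granted to it again. A simplified model of that counter; struct queue_state and request_arrived() are illustrative, timing is reduced to a boolean, and in the scheduler the effect of reaching the threshold is to allow idling again rather than literally re-setting the flag as done here:

	#include <stdbool.h>
	#include <stdio.h>

	#define BFQ_REQUESTS_WITHIN_TIMER 120	/* default from bfq_init_queue() */

	/* Hypothetical per-queue state: only what this heuristic needs. */
	struct queue_state {
		bool io_bound;			/* BFQ_BFQQ_FLAG_IO_bound set? */
		unsigned int requests_within_timer;
	};

	/*
	 * Called (conceptually) when a new request arrives for a queue that had
	 * gone idle after completing its previous request. @within_idle_slice
	 * stands for "the request arrived before the idle-slice timer fired".
	 */
	static void request_arrived(struct queue_state *q, bool within_idle_slice)
	{
		if (q->io_bound)
			return;	/* counter is used only once IO_bound is cleared */

		if (!within_idle_slice) {
			q->requests_within_timer = 0;	/* streak broken */
			return;
		}

		if (++q->requests_within_timer >= BFQ_REQUESTS_WITHIN_TIMER) {
			/*
			 * Enough back-to-back arrivals: modelled here by
			 * marking the queue I/O-bound again, which in the
			 * scheduler translates into idling for it once more.
			 */
			q->io_bound = true;
			q->requests_within_timer = 0;
		}
	}

	int main(void)
	{
		struct queue_state q = { .io_bound = false };

		for (int i = 0; i < BFQ_REQUESTS_WITHIN_TIMER; i++)
			request_arrived(&q, true);
		printf("queue considered I/O-bound again: %d\n", q.io_bound);
		return 0;
	}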