Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r-- | block/bfq-iosched.c | 398 |
1 files changed, 376 insertions, 22 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 95586137194e..0270cd7ca165 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1012,7 +1012,7 @@ static void
 bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 		      struct bfq_io_cq *bic, bool bfq_already_existing)
 {
-	unsigned int old_wr_coeff = bfqq->wr_coeff;
+	unsigned int old_wr_coeff = 1;
 	bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
 
 	if (bic->saved_has_short_ttime)
@@ -1033,7 +1033,13 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 	bfqq->ttime = bic->saved_ttime;
 	bfqq->io_start_time = bic->saved_io_start_time;
 	bfqq->tot_idle_time = bic->saved_tot_idle_time;
-	bfqq->wr_coeff = bic->saved_wr_coeff;
+	/*
+	 * Restore weight coefficient only if low_latency is on
+	 */
+	if (bfqd->low_latency) {
+		old_wr_coeff = bfqq->wr_coeff;
+		bfqq->wr_coeff = bic->saved_wr_coeff;
+	}
 	bfqq->service_from_wr = bic->saved_service_from_wr;
 	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
 	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
@@ -1069,7 +1075,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 static int bfqq_process_refs(struct bfq_queue *bfqq)
 {
 	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
-		(bfqq->weight_counter != NULL);
+		(bfqq->weight_counter != NULL) - bfqq->stable_ref;
 }
 
 /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -2622,6 +2628,11 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
 	return true;
 }
 
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq);
+
+static void bfq_put_stable_ref(struct bfq_queue *bfqq);
+
 /*
  * Attempt to schedule a merge of bfqq with the currently in-service
  * queue or with a close queue among the scheduled queues. Return
@@ -2644,11 +2655,50 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
  */
 static struct bfq_queue *
 bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-		     void *io_struct, bool request)
+		     void *io_struct, bool request, struct bfq_io_cq *bic)
 {
 	struct bfq_queue *in_service_bfqq, *new_bfqq;
 
 	/*
+	 * Check delayed stable merge for rotational or non-queueing
+	 * devs. For this branch to be executed, bfqq must not be
+	 * currently merged with some other queue (i.e., bfqq->bic
+	 * must be non null). If we considered also merged queues,
+	 * then we should also check whether bfqq has already been
+	 * merged with bic->stable_merge_bfqq. But this would be
+	 * costly and complicated.
+	 */
+	if (unlikely(!bfqd->nonrot_with_queueing)) {
+		if (bic->stable_merge_bfqq &&
+		    !bfq_bfqq_just_created(bfqq) &&
+		    time_is_after_jiffies(bfqq->split_time +
+					  msecs_to_jiffies(200))) {
+			struct bfq_queue *stable_merge_bfqq =
+				bic->stable_merge_bfqq;
+			int proc_ref = min(bfqq_process_refs(bfqq),
+					   bfqq_process_refs(stable_merge_bfqq));
+
+			/* deschedule stable merge, because done or aborted here */
+			bfq_put_stable_ref(stable_merge_bfqq);
+
+			bic->stable_merge_bfqq = NULL;
+
+			if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
+			    proc_ref > 0) {
+				/* next function will take at least one ref */
+				struct bfq_queue *new_bfqq =
+					bfq_setup_merge(bfqq, stable_merge_bfqq);
+
+				bic->stably_merged = true;
+				if (new_bfqq && new_bfqq->bic)
+					new_bfqq->bic->stably_merged = true;
+				return new_bfqq;
+			} else
+				return NULL;
+		}
+	}
+
+	/*
 	 * Do not perform queue merging if the device is non
 	 * rotational and performs internal queueing. In fact, such a
 	 * device reaches a high speed through internal parallelism
@@ -2789,6 +2839,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	}
 }
 
+
+static void
+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+{
+	if (cur_bfqq->entity.parent &&
+	    cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
+		cur_bfqq->entity.parent->last_bfqq_created = new_bfqq;
+	else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq)
+		cur_bfqq->bfqd->last_bfqq_created = new_bfqq;
+}
+
 void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	/*
@@ -2806,6 +2867,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	    bfqq != bfqd->in_service_queue)
 		bfq_del_bfqq_busy(bfqd, bfqq, false);
 
+	bfq_reassign_last_bfqq(bfqq, NULL);
+
 	bfq_put_queue(bfqq);
 }
 
@@ -2823,6 +2886,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	bfq_clear_bfqq_IO_bound(bfqq);
 
 	/*
+	 * The processes associated with bfqq are cooperators of the
+	 * processes associated with new_bfqq. So, if bfqq has a
+	 * waker, then assume that all these processes will be happy
+	 * to let bfqq's waker freely inject I/O when they have no
+	 * I/O.
+	 */
+	if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq &&
+	    bfqq->waker_bfqq != new_bfqq) {
+		new_bfqq->waker_bfqq = bfqq->waker_bfqq;
+		new_bfqq->tentative_waker_bfqq = NULL;
+
+		/*
+		 * If the waker queue disappears, then
+		 * new_bfqq->waker_bfqq must be reset. So insert
+		 * new_bfqq into the woken_list of the waker. See
+		 * bfq_check_waker for details.
+		 */
+		hlist_add_head(&new_bfqq->woken_list_node,
+			       &new_bfqq->waker_bfqq->woken_list);
+
+	}
+
+	/*
 	 * If bfqq is weight-raised, then let new_bfqq inherit
 	 * weight-raising. To reduce false positives, neglect the case
 	 * where bfqq has just been created, but has not yet made it
@@ -2879,6 +2965,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	 */
 	new_bfqq->pid = -1;
 	bfqq->bic = NULL;
+
+	bfq_reassign_last_bfqq(bfqq, new_bfqq);
+
 	bfq_release_process_ref(bfqd, bfqq);
 }
 
@@ -2906,7 +2995,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
 	 * We take advantage of this function to perform an early merge
 	 * of the queues of possible cooperating processes.
 	 */
-	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
 	if (new_bfqq) {
 		/*
 		 * bic still points to bfqq, then it has not yet been
@@ -4491,9 +4580,15 @@ check_queue:
 			bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
 			bfqq->bic->bfqq[0]->next_rq ?
 			bfqq->bic->bfqq[0] : NULL;
+		struct bfq_queue *blocked_bfqq =
+			!hlist_empty(&bfqq->woken_list) ?
+			container_of(bfqq->woken_list.first,
+				     struct bfq_queue,
+				     woken_list_node)
+			: NULL;
 
 		/*
-		 * The next three mutually-exclusive ifs decide
+		 * The next four mutually-exclusive ifs decide
		 * whether to try injection, and choose the queue to
 		 * pick an I/O request from.
 		 *
@@ -4526,7 +4621,15 @@
 		 * next bfqq's I/O is brought forward dramatically,
 		 * for it is not blocked for milliseconds.
 		 *
-		 * The third if checks whether bfqq is a queue for
+		 * The third if checks whether there is a queue woken
+		 * by bfqq, and currently with pending I/O. Such a
+		 * woken queue does not steal bandwidth from bfqq,
+		 * because it remains soon without I/O if bfqq is not
+		 * served. So there is virtually no risk of loss of
+		 * bandwidth for bfqq if this woken queue has I/O
+		 * dispatched while bfqq is waiting for new I/O.
+		 *
+		 * The fourth if checks whether bfqq is a queue for
 		 * which it is better to avoid injection. It is so if
 		 * bfqq delivers more throughput when served without
 		 * any further I/O from other queues in the middle, or
@@ -4546,11 +4649,11 @@
 		 * bfq_update_has_short_ttime(), it is rather likely
 		 * that, if I/O is being plugged for bfqq and the
 		 * waker queue has pending I/O requests that are
-		 * blocking bfqq's I/O, then the third alternative
+		 * blocking bfqq's I/O, then the fourth alternative
 		 * above lets the waker queue get served before the
 		 * I/O-plugging timeout fires. So one may deem the
 		 * second alternative superfluous. It is not, because
-		 * the third alternative may be way less effective in
+		 * the fourth alternative may be way less effective in
 		 * case of a synchronization. For two main
 		 * reasons. First, throughput may be low because the
 		 * inject limit may be too low to guarantee the same
@@ -4559,7 +4662,7 @@
 		 * guarantees (the second alternative unconditionally
 		 * injects a pending I/O request of the waker queue
 		 * for each bfq_dispatch_request()). Second, with the
-		 * third alternative, the duration of the plugging,
+		 * fourth alternative, the duration of the plugging,
 		 * i.e., the time before bfqq finally receives new I/O,
 		 * may not be minimized, because the waker queue may
 		 * happen to be served only after other queues.
@@ -4577,6 +4680,14 @@ check_queue:
 			   bfq_bfqq_budget_left(bfqq->waker_bfqq)
 			)
 			bfqq = bfqq->waker_bfqq;
+		else if (blocked_bfqq &&
+			   bfq_bfqq_busy(blocked_bfqq) &&
+			   blocked_bfqq->next_rq &&
+			   bfq_serv_to_charge(blocked_bfqq->next_rq,
+					      blocked_bfqq) <=
+			   bfq_bfqq_budget_left(blocked_bfqq)
+			)
+			bfqq = blocked_bfqq;
 		else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
 			 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
 			  !bfq_bfqq_has_short_ttime(bfqq)))
@@ -4983,6 +5094,12 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 	bfqg_and_blkg_put(bfqg);
 }
 
+static void bfq_put_stable_ref(struct bfq_queue *bfqq)
+{
+	bfqq->stable_ref--;
+	bfq_put_queue(bfqq);
+}
+
 static void bfq_put_cooperator(struct bfq_queue *bfqq)
 {
 	struct bfq_queue *__bfqq, *next;
@@ -5039,6 +5156,24 @@ static void bfq_exit_icq(struct io_cq *icq)
 {
 	struct bfq_io_cq *bic = icq_to_bic(icq);
 
+	if (bic->stable_merge_bfqq) {
+		struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
+
+		/*
+		 * bfqd is NULL if scheduler already exited, and in
+		 * that case this is the last time bfqq is accessed.
+		 */
+		if (bfqd) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&bfqd->lock, flags);
+			bfq_put_stable_ref(bic->stable_merge_bfqq);
+			spin_unlock_irqrestore(&bfqd->lock, flags);
+		} else {
+			bfq_put_stable_ref(bic->stable_merge_bfqq);
+		}
+	}
+
 	bfq_exit_icq_bfqq(bic, true);
 	bfq_exit_icq_bfqq(bic, false);
 }
@@ -5099,7 +5234,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				       struct bio *bio, bool is_sync,
-				       struct bfq_io_cq *bic);
+				       struct bfq_io_cq *bic,
+				       bool respawn);
 
 static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 {
@@ -5119,7 +5255,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 	bfqq = bic_to_bfqq(bic, false);
 	if (bfqq) {
 		bfq_release_process_ref(bfqd, bfqq);
-		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
 		bic_set_bfqq(bic, bfqq, false);
 	}
 
@@ -5162,6 +5298,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	/* set end request to minus infinity from now */
 	bfqq->ttime.last_end_request = now_ns + 1;
 
+	bfqq->creation_time = jiffies;
+
 	bfqq->io_start_time = now_ns;
 
 	bfq_mark_bfqq_IO_bound(bfqq);
@@ -5211,9 +5349,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
 	}
 }
 
+static struct bfq_queue *
+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_io_cq *bic,
+			  struct bfq_queue *last_bfqq_created)
+{
+	struct bfq_queue *new_bfqq =
+		bfq_setup_merge(bfqq, last_bfqq_created);
+
+	if (!new_bfqq)
+		return bfqq;
+
+	if (new_bfqq->bic)
+		new_bfqq->bic->stably_merged = true;
+	bic->stably_merged = true;
+
+	/*
+	 * Reusing merge functions. This implies that
+	 * bfqq->bic must be set too, for
+	 * bfq_merge_bfqqs to correctly save bfqq's
+	 * state before killing it.
+	 */
+	bfqq->bic = bic;
+	bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+
+	return new_bfqq;
+}
+
+/*
+ * Many throughput-sensitive workloads are made of several parallel
+ * I/O flows, with all flows generated by the same application, or
+ * more generically by the same task (e.g., system boot). The most
+ * counterproductive action with these workloads is plugging I/O
+ * dispatch when one of the bfq_queues associated with these flows
+ * remains temporarily empty.
+ *
+ * To avoid this plugging, BFQ has been using a burst-handling
+ * mechanism for years now. This mechanism has proven effective for
+ * throughput, and not detrimental for service guarantees. The
+ * following function pushes this mechanism a little bit further,
+ * basing on the following two facts.
+ *
+ * First, all the I/O flows of a the same application or task
+ * contribute to the execution/completion of that common application
+ * or task. So the performance figures that matter are total
+ * throughput of the flows and task-wide I/O latency. In particular,
+ * these flows do not need to be protected from each other, in terms
+ * of individual bandwidth or latency.
+ *
+ * Second, the above fact holds regardless of the number of flows.
+ *
+ * Putting these two facts together, this commits merges stably the
+ * bfq_queues associated with these I/O flows, i.e., with the
+ * processes that generate these IO/ flows, regardless of how many the
+ * involved processes are.
+ *
+ * To decide whether a set of bfq_queues is actually associated with
+ * the I/O flows of a common application or task, and to merge these
+ * queues stably, this function operates as follows: given a bfq_queue,
+ * say Q2, currently being created, and the last bfq_queue, say Q1,
+ * created before Q2, Q2 is merged stably with Q1 if
+ * - very little time has elapsed since when Q1 was created
+ * - Q2 has the same ioprio as Q1
+ * - Q2 belongs to the same group as Q1
+ *
+ * Merging bfq_queues also reduces scheduling overhead. A fio test
+ * with ten random readers on /dev/nullb shows a throughput boost of
+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of
+ * the total per-request processing time, the above throughput boost
+ * implies that BFQ's overhead is reduced by more than 50%.
+ *
+ * This new mechanism most certainly obsoletes the current
+ * burst-handling heuristics. We keep those heuristics for the moment.
+ */
+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
+						      struct bfq_queue *bfqq,
+						      struct bfq_io_cq *bic)
+{
+	struct bfq_queue **source_bfqq = bfqq->entity.parent ?
+		&bfqq->entity.parent->last_bfqq_created :
+		&bfqd->last_bfqq_created;
+
+	struct bfq_queue *last_bfqq_created = *source_bfqq;
+
+	/*
+	 * If last_bfqq_created has not been set yet, then init it. If
+	 * it has been set already, but too long ago, then move it
+	 * forward to bfqq. Finally, move also if bfqq belongs to a
+	 * different group than last_bfqq_created, or if bfqq has a
+	 * different ioprio or ioprio_class. If none of these
+	 * conditions holds true, then try an early stable merge or
+	 * schedule a delayed stable merge.
+	 *
+	 * A delayed merge is scheduled (instead of performing an
+	 * early merge), in case bfqq might soon prove to be more
+	 * throughput-beneficial if not merged. Currently this is
+	 * possible only if bfqd is rotational with no queueing. For
+	 * such a drive, not merging bfqq is better for throughput if
+	 * bfqq happens to contain sequential I/O. So, we wait a
+	 * little bit for enough I/O to flow through bfqq. After that,
+	 * if such an I/O is sequential, then the merge is
+	 * canceled. Otherwise the merge is finally performed.
+	 */
+	if (!last_bfqq_created ||
+	    time_before(last_bfqq_created->creation_time +
+			bfqd->bfq_burst_interval,
+			bfqq->creation_time) ||
+	    bfqq->entity.parent != last_bfqq_created->entity.parent ||
+	    bfqq->ioprio != last_bfqq_created->ioprio ||
+	    bfqq->ioprio_class != last_bfqq_created->ioprio_class)
+		*source_bfqq = bfqq;
+	else if (time_after_eq(last_bfqq_created->creation_time +
+			       bfqd->bfq_burst_interval,
+			       bfqq->creation_time)) {
+		if (likely(bfqd->nonrot_with_queueing))
+			/*
+			 * With this type of drive, leaving
+			 * bfqq alone may provide no
+			 * throughput benefits compared with
+			 * merging bfqq. So merge bfqq now.
+			 */
+			bfqq = bfq_do_early_stable_merge(bfqd, bfqq,
+							 bic,
+							 last_bfqq_created);
+		else { /* schedule tentative stable merge */
+			/*
+			 * get reference on last_bfqq_created,
+			 * to prevent it from being freed,
+			 * until we decide whether to merge
+			 */
+			last_bfqq_created->ref++;
+			/*
+			 * need to keep track of stable refs, to
+			 * compute process refs correctly
+			 */
+			last_bfqq_created->stable_ref++;
+			/*
+			 * Record the bfqq to merge to.
+			 */
+			bic->stable_merge_bfqq = last_bfqq_created;
+		}
+	}
+
+	return bfqq;
+}
+
+
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				       struct bio *bio, bool is_sync,
-				       struct bfq_io_cq *bic)
+				       struct bfq_io_cq *bic,
+				       bool respawn)
 {
 	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
 	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
@@ -5271,7 +5556,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
 out:
 	bfqq->ref++; /* get a process reference to this queue */
-	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+
+	if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn)
+		bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic);
+
 	rcu_read_unlock();
 	return bfqq;
 }
@@ -5521,7 +5809,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq),
-		*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+		*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true,
+						 RQ_BIC(rq));
 	bool waiting, idle_timer_disabled = false;
 
 	if (new_bfqq) {
@@ -5627,7 +5916,48 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
 	spin_lock_irq(&bfqd->lock);
 	bfqq = bfq_init_rq(rq);
-	if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
+
+	/*
+	 * Reqs with at_head or passthrough flags set are to be put
+	 * directly into dispatch list. Additional case for putting rq
+	 * directly into the dispatch queue: the only active
+	 * bfq_queues are bfqq and either its waker bfq_queue or one
+	 * of its woken bfq_queues. The rationale behind this
+	 * additional condition is as follows:
+	 * - consider a bfq_queue, say Q1, detected as a waker of
+	 *   another bfq_queue, say Q2
+	 * - by definition of a waker, Q1 blocks the I/O of Q2, i.e.,
+	 *   some I/O of Q1 needs to be completed for new I/O of Q2
+	 *   to arrive. A notable example of waker is journald
+	 * - so, Q1 and Q2 are in any respect the queues of two
+	 *   cooperating processes (or of two cooperating sets of
+	 *   processes): the goal of Q1's I/O is doing what needs to
+	 *   be done so that new Q2's I/O can finally be
+	 *   issued. Therefore, if the service of Q1's I/O is delayed,
+	 *   then Q2's I/O is delayed too. Conversely, if Q2's I/O is
+	 *   delayed, the goal of Q1's I/O is hindered.
+	 * - as a consequence, if some I/O of Q1/Q2 arrives while
+	 *   Q2/Q1 is the only queue in service, there is absolutely
+	 *   no point in delaying the service of such an I/O. The
+	 *   only possible result is a throughput loss
+	 * - so, when the above condition holds, the best option is to
+	 *   have the new I/O dispatched as soon as possible
+	 * - the most effective and efficient way to attain the above
+	 *   goal is to put the new I/O directly in the dispatch
+	 *   list
+	 * - as an additional restriction, Q1 and Q2 must be the only
+	 *   busy queues for this commit to put the I/O of Q2/Q1 in
+	 *   the dispatch list. This is necessary, because, if also
+	 *   other queues are waiting for service, then putting new
+	 *   I/O directly in the dispatch list may evidently cause a
+	 *   violation of service guarantees for the other queues
+	 */
+	if (!bfqq ||
+	    (bfqq != bfqd->in_service_queue &&
+	     bfqd->in_service_queue != NULL &&
+	     bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) &&
+	     (bfqq->waker_bfqq == bfqd->in_service_queue ||
+	      bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) {
 		if (at_head)
 			list_add(&rq->queuelist, &bfqd->dispatch);
 		else
@@ -5767,7 +6097,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		       1UL<<(BFQ_RATE_SHIFT - 10))
 		bfq_update_rate_reset(bfqd, NULL);
 	bfqd->last_completion = now_ns;
-	bfqd->last_completed_rq_bfqq = bfqq;
+	/*
+	 * Shared queues are likely to receive I/O at a high
+	 * rate. This may deceptively let them be considered as wakers
+	 * of other queues. But a false waker will unjustly steal
+	 * bandwidth to its supposedly woken queue. So considering
+	 * also shared queues in the waking mechanism may cause more
+	 * control troubles than throughput benefits. Then do not set
+	 * last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
+	 */
+	if (!bfq_bfqq_coop(bfqq))
+		bfqd->last_completed_rq_bfqq = bfqq;
 
 	/*
 	 * If we are waiting to discover whether the request pattern
@@ -6124,7 +6464,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 
 	if (bfqq)
 		bfq_put_queue(bfqq);
-	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
 	bic_set_bfqq(bic, bfqq, is_sync);
 
 	if (split && is_sync) {
@@ -6245,8 +6585,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 
 	if (likely(!new_queue)) {
 		/* If the queue was seeky for too long, break it apart. */
-		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
-			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
+		    !bic->stably_merged) {
+			struct bfq_queue *old_bfqq = bfqq;
 
 			/* Update bic before losing reference to bfqq */
 			if (bfq_bfqq_in_large_burst(bfqq))
@@ -6255,11 +6596,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 			bfqq = bfq_split_bfqq(bic, bfqq);
 			split = true;
 
-			if (!bfqq)
+			if (!bfqq) {
 				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
 								 true, is_sync,
 								 NULL);
-			else
+				bfqq->waker_bfqq = old_bfqq->waker_bfqq;
+				bfqq->tentative_waker_bfqq = NULL;
+
+				/*
+				 * If the waker queue disappears, then
+				 * new_bfqq->waker_bfqq must be
+				 * reset. So insert new_bfqq into the
+				 * woken_list of the waker. See
+				 * bfq_check_waker for details.
+				 */
+				if (bfqq->waker_bfqq)
+					hlist_add_head(&bfqq->woken_list_node,
+						       &bfqq->waker_bfqq->woken_list);
+			} else
				bfqq_already_existing = true;
 		}
 	}
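Purely as an illustration of the hunks above (not part of the patch), the three-way choice made by bfq_do_or_sched_stable_merge() can be condensed into a small user-space sketch. struct toy_queue, decide_stable_merge(), group_id and the burst-interval value used below are simplified stand-ins for the kernel structures, not the BFQ API, and jiffies wraparound handling is omitted.

/*
 * Toy model of the stable-merge decision: given the last created
 * queue Q1 and a newly created queue Q2, either move the
 * last-created pointer forward, merge early (fast non-rotational
 * drive with internal queueing), or schedule a delayed, tentative
 * merge (rotational or non-queueing drive).
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_queue {
	unsigned long creation_time;	/* "jiffies" at creation */
	int ioprio, ioprio_class;
	int group_id;			/* stand-in for entity.parent */
};

enum merge_action { SET_LAST, EARLY_MERGE, SCHEDULE_DELAYED_MERGE };

static enum merge_action
decide_stable_merge(const struct toy_queue *last, const struct toy_queue *newq,
		    unsigned long burst_interval, bool nonrot_with_queueing)
{
	/* No usable Q1, too much time elapsed, or different group/ioprio:
	 * just record Q2 as the new last-created queue. */
	if (!last ||
	    newq->creation_time > last->creation_time + burst_interval ||
	    newq->group_id != last->group_id ||
	    newq->ioprio != last->ioprio ||
	    newq->ioprio_class != last->ioprio_class)
		return SET_LAST;

	/* Otherwise merge now on a fast drive, or defer the decision. */
	return nonrot_with_queueing ? EARLY_MERGE : SCHEDULE_DELAYED_MERGE;
}

int main(void)
{
	struct toy_queue q1 = { .creation_time = 1000, .ioprio = 4, .group_id = 1 };
	struct toy_queue q2 = { .creation_time = 1020, .ioprio = 4, .group_id = 1 };

	/* Q2 created right after Q1, same ioprio and group: merge path taken. */
	printf("fast drive: %d\n", decide_stable_merge(&q1, &q2, 180, true));
	printf("slow drive: %d\n", decide_stable_merge(&q1, &q2, 180, false));
	return 0;
}

Compiled on its own, the first call prints 1 (EARLY_MERGE) and the second prints 2 (SCHEDULE_DELAYED_MERGE), mirroring the fast-drive versus rotational/non-queueing paths taken in bfq_do_or_sched_stable_merge() above.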