Newer
Older
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
+ * . Q is inserted in the burst list, as Q may be the first queue
+ * in a possible new burst (then the burst list contains just Q
+ * after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			     bool idle_for_long_time)
+{
+	/*
+	 * A queue that was activated during a burst, but then stayed idle
+	 * at least as long as an interactive queue would, is assumed to
+	 * have completed its share of the I/O started in that burst.
+	 * It therefore stops being treated as a burst member: its
+	 * in_large_burst flag is cleared and it is unlinked from the
+	 * burst list. burst_size is deliberately left untouched, because
+	 * leaving the list does not change the fact that bfqq may have
+	 * been activated during the current burst.
+	 */
+	if (idle_for_long_time) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	/* Already linked in the burst list: nothing else to do. */
+	if (!hlist_unhashed(&bfqq->burst_list_node))
+		return;
+
+	/* Already flagged as part of a large burst: nothing else to do. */
+	if (bfq_bfqq_in_large_burst(bfqq))
+		return;
+
+	/*
+	 * A late enough activation means that the current burst is over,
+	 * so the burst bookkeeping is restarted.
+	 *
+	 * Special case: bfqq is the very first queue ever activated, and
+	 * last_ins_in_burst is not yet meaningful. Whatever the outcome
+	 * of the test below, bfqq ends up as the only element of the
+	 * burst list, which is the desired result since bfqq may be the
+	 * first queue of a possible burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+				   bfqd->bfq_burst_interval)) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		return;
+	}
+
+	/*
+	 * From here on, bfqq is being activated shortly after the last
+	 * queue. If the burst has already been classified as large, bfqq
+	 * is flagged right away; otherwise it joins the burst that is
+	 * still being built.
+	 */
+	if (bfqd->large_burst)
+		bfq_mark_bfqq_in_large_burst(bfqq);
+	else
+		bfq_add_to_burst(bfqd, bfqq);
+}
+
+/*
+ * Insert @rq into its bfq_queue: update the queued counters, add the
+ * request to the sort list and re-evaluate the next request to serve.
+ * If the queue was not busy, burst handling and the weight-raising
+ * start/stop decisions are performed before the queue is marked busy;
+ * otherwise only the async weight-raising start and the next-request
+ * update are handled.
+ */
+static void bfq_add_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_data *bfqd = bfqq->bfqd;
+	struct request *next_rq, *prev;
+	unsigned long old_wr_coeff = bfqq->wr_coeff;
+	bool interactive = false;
+
+	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+	bfqq->queued[rq_is_sync(rq)]++;
+	bfqd->queued++;
+
+	elv_rb_add(&bfqq->sort_list, rq);
+
+	/*
+	 * Check if this request is a better next-serve candidate.
+	 */
+	prev = bfqq->next_rq;
+	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+	bfqq->next_rq = next_rq;
+
+	if (!bfq_bfqq_busy(bfqq)) {
+		bool soft_rt, in_burst,
+		     idle_for_long_time = time_is_before_jiffies(
+						bfqq->budget_timeout +
+						bfqd->bfq_wr_min_idle_time);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+					 rq->cmd_flags);
+#endif
+		if (bfq_bfqq_sync(bfqq)) {
+			bool already_in_burst =
+			   !hlist_unhashed(&bfqq->burst_list_node) ||
+			   bfq_bfqq_in_large_burst(bfqq);
+			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
+			/*
+			 * If bfqq was not already in the current burst,
+			 * then, at this point, bfqq either has been
+			 * added to the current burst or has caused the
+			 * current burst to terminate. In particular, in
+			 * the second case, bfqq has become the first
+			 * queue in a possible new burst.
+			 * In both cases last_ins_in_burst needs to be
+			 * moved forward.
+			 */
+			if (!already_in_burst)
+				bfqd->last_ins_in_burst = jiffies;
+		}
+
+		in_burst = bfq_bfqq_in_large_burst(bfqq);
+		/*
+		 * NOTE(review): the !in_burst term was restored here so
+		 * that a queue in a large burst cannot start soft-rt
+		 * weight raising, mirroring 'interactive' below and the
+		 * in_burst branch that ends weight raising. Confirm
+		 * against the pristine patch.
+		 */
+		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+			!in_burst &&
+			time_is_before_jiffies(bfqq->soft_rt_next_start);
+		interactive = !in_burst && idle_for_long_time;
+		entity->budget = max_t(unsigned long, bfqq->max_budget,
+				       bfq_serv_to_charge(next_rq, bfqq));
+
+		if (!bfq_bfqq_IO_bound(bfqq)) {
+			if (time_before(jiffies,
+					RQ_BIC(rq)->ttime.last_end_request +
+					bfqd->bfq_slice_idle)) {
+				bfqq->requests_within_timer++;
+				if (bfqq->requests_within_timer >=
+				    bfqd->bfq_requests_within_timer)
+					bfq_mark_bfqq_IO_bound(bfqq);
+			} else
+				bfqq->requests_within_timer = 0;
+		}
+
+		if (!bfqd->low_latency)
+			goto add_bfqq_busy;
+
+		/*
+		 * If the queue:
+		 * - is not being boosted,
+		 * - has been idle for enough time,
+		 * - is not a sync queue or is linked to a bfq_io_cq (it is
+		 *   shared "for its nature" or it is not shared and its
+		 *   requests have not been redirected to a shared queue)
+		 * start a weight-raising period.
+		 */
+		if (old_wr_coeff == 1 && (interactive || soft_rt) &&
+		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais starting at %lu, rais_max_time %u",
+				     jiffies,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+		} else if (old_wr_coeff > 1) {
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else if (in_burst ||
+				 (bfqq->wr_cur_max_time ==
+				  bfqd->bfq_wr_rt_max_time &&
+				  !soft_rt)) {
+				bfqq->wr_coeff = 1;
+				bfq_log_bfqq(bfqd, bfqq,
+					"wrais ending at %lu, rais_max_time %u",
+					jiffies,
+					jiffies_to_msecs(bfqq->
+						wr_cur_max_time));
+			} else if (time_before(
+					bfqq->last_wr_start_finish +
+					bfqq->wr_cur_max_time,
+					jiffies +
+					bfqd->bfq_wr_rt_max_time) &&
+				   soft_rt) {
+				/*
+				 * The remaining weight-raising time is lower
+				 * than bfqd->bfq_wr_rt_max_time, which means
+				 * that the application is enjoying weight
+				 * raising either because deemed soft-rt in
+				 * the near past, or because deemed interactive
+				 * long ago.
+				 * In both cases, resetting now the current
+				 * remaining weight-raising time for the
+				 * application to the weight-raising duration
+				 * for soft rt applications would not cause any
+				 * latency increase for the application (as the
+				 * new duration would be higher than the
+				 * remaining time).
+				 *
+				 * In addition, the application is now meeting
+				 * the requirements for being deemed soft rt.
+				 * In the end we can correctly and safely
+				 * (re)charge the weight-raising duration for
+				 * the application with the weight-raising
+				 * duration for soft rt applications.
+				 *
+				 * In particular, doing this recharge now, i.e.,
+				 * before the weight-raising period for the
+				 * application finishes, reduces the probability
+				 * of the following negative scenario:
+				 * 1) the weight of a soft rt application is
+				 *    raised at startup (as for any newly
+				 *    created application),
+				 * 2) since the application is not interactive,
+				 *    at a certain time weight-raising is
+				 *    stopped for the application,
+				 * 3) at that time the application happens to
+				 *    still have pending requests, and hence
+				 *    is destined to not have a chance to be
+				 *    deemed soft rt before these requests are
+				 *    completed (see the comments to the
+				 *    function bfq_bfqq_softrt_next_start()
+				 *    for details on soft rt detection),
+				 * 4) these pending requests experience a high
+				 *    latency because the application is not
+				 *    weight-raised while they are pending.
+				 */
+				bfqq->last_wr_start_finish = jiffies;
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			}
+		}
+		/*
+		 * A changed weight-raising coefficient must be propagated
+		 * to the entity weight.
+		 * NOTE(review): flag name restored from the upstream
+		 * sources (prio_changed) - confirm for this version.
+		 */
+		if (old_wr_coeff != bfqq->wr_coeff)
+			entity->prio_changed = 1;
+add_bfqq_busy:
+		bfqq->last_idle_bklogged = jiffies;
+		bfqq->service_from_backlogged = 0;
+		bfq_clear_bfqq_softrt_update(bfqq);
+		bfq_add_bfqq_busy(bfqd, bfqq);
+	} else {
+		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+		    time_is_before_jiffies(
+				bfqq->last_wr_start_finish +
+				bfqd->bfq_wr_min_inter_arr_async)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+			bfqd->wr_busy_queues++;
+			/* NOTE(review): restored, see the note above. */
+			entity->prio_changed = 1;
+			bfq_log_bfqq(bfqd, bfqq,
+			    "non-idle wrais starting at %lu, rais_max_time %u",
+			    jiffies,
+			    jiffies_to_msecs(bfqq->wr_cur_max_time));
+		}
+		if (prev != bfqq->next_rq)
+			bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	if (bfqd->low_latency &&
+		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+		bfqq->last_wr_start_finish = jiffies;
+}
+
+/*
+ * Look for a queued request that @bio could be front-merged into,
+ * i.e. a request in the sort list of the current task's queue that is
+ * keyed at the end sector of @bio.
+ *
+ * Returns the request found, or NULL if the current task has no
+ * bfq_io_cq for this device, no queue for the bio's sync-ness, or no
+ * matching request.
+ */
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+					  struct bio *bio)
+{
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (!bic)
+		return NULL;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	if (bfqq)
+		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
+
+	return NULL;
+}
+
+/*
+ * Account for @rq being handed to the driver: bump the in-driver
+ * counter and record the sector right after the end of @rq as the
+ * last position.
+ */
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+	struct elevator_queue *eq = q->elevator;
+	struct bfq_data *bfqd = eq->elevator_data;
+
+	++bfqd->rq_in_driver;
+	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
+		(long long unsigned)bfqd->last_position);
+}
+
+/*
+ * Undo the in-driver accounting done when a request was activated.
+ * It is a bug to get here with no request accounted as in the driver.
+ */
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	BUG_ON(bfqd->rq_in_driver == 0);
+	bfqd->rq_in_driver--;
+}
+
+/*
+ * Remove @rq from its bfq_queue: pick a new next request if @rq was
+ * the next one, unlink @rq from the fifo and from the sort list, and
+ * update the queued counters. If the queue becomes empty it is taken
+ * out of the busy set (unless it is in service) and out of the
+ * request-position tree.
+ */
+static void bfq_remove_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	const int sync = rq_is_sync(rq);
+
+	if (bfqq->next_rq == rq) {
+		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+		bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	/* Unlink from the fifo only if @rq is actually linked there. */
+	if (rq->queuelist.prev != &rq->queuelist)
+		list_del_init(&rq->queuelist);
+	BUG_ON(bfqq->queued[sync] == 0);
+	bfqq->queued[sync]--;
+	bfqd->queued--;
+	elv_rb_del(&bfqq->sort_list, rq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
+			bfq_del_bfqq_busy(bfqd, bfqq, 1);
+		/*
+		 * Remove queue from request-position tree as it is empty.
+		 * The queue may not be in any tree, in which case
+		 * pos_root is NULL and there is nothing to erase.
+		 */
+		if (bfqq->pos_root) {
+			rb_erase(&bfqq->pos_node, bfqq->pos_root);
+			bfqq->pos_root = NULL;
+		}
+	}
+
+	if (rq->cmd_flags & REQ_META) {
+		BUG_ON(bfqq->meta_pending == 0);
+		bfqq->meta_pending--;
+	}
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
+#endif
+}
+
+/*
+ * Check whether @bio can be front-merged into an already queued
+ * request. On success the request is stored in *@req and
+ * ELEVATOR_FRONT_MERGE is returned; otherwise ELEVATOR_NO_MERGE is
+ * returned and *@req is left untouched.
+ */
+static int bfq_merge(struct request_queue *q, struct request **req,
+		     struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct request *__rq;
+
+	__rq = bfq_find_rq_fmerge(bfqd, bio);
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+		*req = __rq;
+		return ELEVATOR_FRONT_MERGE;
+	}
+
+	return ELEVATOR_NO_MERGE;
+}
+
+/*
+ * Called after a bio has been merged into @req. Only a front merge
+ * that moved the start of @req before its predecessor in the sort
+ * tree needs handling: @req is re-inserted at its new position and
+ * the next request to serve is re-evaluated.
+ */
+static void bfq_merged_request(struct request_queue *q, struct request *req,
+			       int type)
+{
+	if (type == ELEVATOR_FRONT_MERGE &&
+	    rb_prev(&req->rb_node) &&
+	    blk_rq_pos(req) <
+	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
+				    struct request, rb_node))) {
+		struct bfq_queue *bfqq = RQ_BFQQ(req);
+		struct bfq_data *bfqd = bfqq->bfqd;
+		struct request *prev, *next_rq;
+
+		/* Reposition request in its sort_list */
+		elv_rb_del(&bfqq->sort_list, req);
+		elv_rb_add(&bfqq->sort_list, req);
+		/* Choose next request to be served for bfqq */
+		prev = bfqq->next_rq;
+		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
+					 bfqd->last_position);
+		bfqq->next_rq = next_rq;
+		/*
+		 * If next_rq changed, let the queue react to the new
+		 * next request, as is done when a request is added to
+		 * an already busy queue.
+		 */
+		if (prev != bfqq->next_rq)
+			bfq_updated_next_req(bfqd, bfqq);
+	}
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/*
+ * Record, in the statistics of the group owning @req's queue, that
+ * @bio has been merged.
+ */
+static void bfq_bio_merged(struct request_queue *q, struct request *req,
+			   struct bio *bio)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(req);
+
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), bio->bi_rw);
+}
+#endif
+
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
+/*
+ * Called when @next has been merged into @rq: @rq possibly inherits
+ * the fifo position and the next_rq role of @next, then @next is
+ * removed from its queue.
+ */
+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
+				struct request *next)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_queue *next_bfqq = RQ_BFQQ(next);
+	bool same_queue = (bfqq == next_bfqq);
+
+	/*
+	 * When both requests sit in the same bfq_queue and next is the
+	 * older one, rq takes over next's slot in the fifo. Requests of
+	 * different bfq_queues are never repositioned: rq would have to
+	 * be placed according to next's position in next's own fifo,
+	 * which would most certainly cost more than it is worth.
+	 */
+	if (same_queue &&
+	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+	    time_before(next->fifo_time, rq->fifo_time)) {
+		list_del_init(&rq->queuelist);
+		list_replace_init(&next->queuelist, &rq->queuelist);
+		rq->fifo_time = next->fifo_time;
+	}
+
+	/* next is going away: do not leave it as the next request. */
+	if (bfqq->next_rq == next)
+		bfqq->next_rq = rq;
+
+	bfq_remove_request(next);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+#endif
+}
+
+/*
+ * End weight raising for @bfqq: drop it from the count of busy
+ * weight-raised queues if it is busy, and reset its coefficient and
+ * its weight-raising duration.
+ *
+ * Must be called with bfqq != NULL.
+ */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_busy(bfqq))
+		bfqq->bfqd->wr_busy_queues--;
+	bfqq->wr_coeff = 1;
+	bfqq->wr_cur_max_time = 0;
+	/*
+	 * Trigger a weight change on the next activation of the queue.
+	 * NOTE(review): flag name restored from the upstream sources
+	 * (prio_changed) - confirm for this version.
+	 */
+	bfqq->entity.prio_changed = 1;
+}
+
+/*
+ * End weight raising for every async queue of @bfqg: the 2 x
+ * IOPRIO_BE_NR per-priority queues and the idle-class queue.
+ *
+ * bfq_bfqq_end_wr() must not be passed NULL, so empty slots are
+ * skipped (presumably async queues are created on demand - verify
+ * against the allocation path).
+ */
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			if (bfqg->async_bfqq[i][j])
+				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+	if (bfqg->async_idle_bfqq)
+		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+/*
+ * End weight raising for all the queues of @bfqd: those on the active
+ * list, those on the idle list and the async ones. The whole walk is
+ * done with the request-queue lock held and interrupts disabled.
+ */
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+	spinlock_t *lock = bfqd->queue->queue_lock;
+	struct bfq_queue *cur;
+
+	spin_lock_irq(lock);
+
+	list_for_each_entry(cur, &bfqd->active_list, bfqq_list)
+		bfq_bfqq_end_wr(cur);
+
+	list_for_each_entry(cur, &bfqd->idle_list, bfqq_list)
+		bfq_bfqq_end_wr(cur);
+
+	bfq_end_wr_async(bfqd);
+
+	spin_unlock_irq(lock);
+}
+
+/*
+ * Decide whether @bio may be merged into @rq.
+ *
+ * Returns non-zero only if @bio would be queued in the same bfq_queue
+ * that holds @rq; returns 0 for a sync bio against an async request
+ * or when the current task has no bfq_io_cq for this device.
+ */
+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
+			   struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic;
+
+	/*
+	 * Disallow merge of a sync bio into an async request.
+	 */
+	if (bfq_bio_sync(bio) && !rq_is_sync(rq))
+		return 0;
+
+	/*
+	 * Lookup the bfqq that this bio will be queued with. Allow
+	 * merge only if rq is queued there.
+	 * Queue lock is held here.
+	 */
+	bic = bfq_bic_lookup(bfqd, current->io_context);
+	/* No io context association yet: nothing to merge with. */
+	if (!bic)
+		return 0;
+
+	return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq);
+}
+
+/*
+ * Install @bfqq (possibly NULL) as the in-service queue of @bfqd.
+ * For a non-NULL queue, the per-service flags are reset first and the
+ * budgets_assigned average is updated.
+ */
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+				       struct bfq_queue *bfqq)
+{
+	if (!bfqq)
+		goto install;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+#endif
+	bfq_mark_bfqq_must_alloc(bfqq);
+	bfq_mark_bfqq_budget_new(bfqq);
+	bfq_clear_bfqq_fifo_expire(bfqq);
+
+	/* Weighted update: 7/8 of the old value plus a constant 256/8. */
+	bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "set_in_service_queue, cur-budget = %d",
+		     bfqq->entity.budget);
+
+install:
+	bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Pick the next queue to serve, install it as the in-service queue
+ * and return it.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *next = bfq_get_next_queue(bfqd);
+
+	__bfq_set_in_service_queue(bfqd, next);
+
+	return next;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget.
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget;
+	else
+		return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32). The default max budget is used
+ * until enough budgets have been assigned.
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget / 32;
+	else
+		return bfqd->bfq_max_budget / 32;
+}
+
+/*
+ * Arm the idle-slice timer for the in-service queue, which must have
+ * an empty sort list. Nothing is armed if there is no in-service
+ * bfq_io_cq or if its io context has no active references left.
+ */
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	struct bfq_io_cq *bic;
+	unsigned long sl;
+
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Processes have exited, don't wait. */
+	bic = bfqd->in_service_bic;
+	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
+		return;
+
+	bfq_mark_bfqq_wait_request(bfqq);
+
+	/*
+	 * We don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. So allow a little bit of time for him to submit a new rq.
+	 *
+	 * To prevent processes with (partly) seeky workloads from
+	 * being too ill-treated, grant them a small fraction of the
+	 * assigned budget before reducing the waiting time to
+	 * BFQ_MIN_TT. This happened to help reduce latency.
+	 */
+	sl = bfqd->bfq_slice_idle;
+	/*
+	 * Unless the queue is being weight-raised or the scenario is
+	 * asymmetric, grant only minimum idle time if the queue either
+	 * has been seeky for long enough or has already proved to be
+	 * constantly seeky.
+	 *
+	 * NOTE(review): the last term of the condition was missing; it
+	 * has been restored as bfq_symmetric_scenario(bfqd) to match the
+	 * "scenario is asymmetric" wording above. Confirm the helper
+	 * name against the rest of the file.
+	 */
+	if (bfq_sample_valid(bfqq->seek_samples) &&
+	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
+				  bfq_max_budget(bfqq->bfqd) / 8) ||
+	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
+	    bfq_symmetric_scenario(bfqd))
+		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
+	else if (bfqq->wr_coeff > 1)
+		sl = sl * 3;
+	bfqd->last_idling_start = ktime_get();
+	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
+#endif
+	bfq_log(bfqd, "arm idle: %u/%u ms",
+		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
+}
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the disk
+ * throughput (always guaranteed with a time slice scheme as in CFQ).
+ *
+ * The base timeout is scaled by weight / orig_weight, except when the
+ * queue's weight-raising duration equals the soft real-time one, in
+ * which case no scaling is applied.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	unsigned int timeout_coeff = 1;
+
+	if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time)
+		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+	bfqd->last_budget_start = ktime_get();
+	bfq_clear_bfqq_budget_new(bfqq);
+
+	bfqq->budget_timeout = jiffies +
+		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
+
+	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
+		timeout_coeff));
+}
+
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	/*
+	 * For consistency, the next instruction should have been executed
+	 * after removing the request from the queue and dispatching it.
+	 * We execute instead this instruction before bfq_remove_request()
+	 * (and hence introduce a temporary inconsistency), for efficiency.
+	 * In fact, in a forced_dispatch, this prevents two counters related
+	 * to bfqq->dispatched to risk to be uselessly decremented if bfqq
+	 * is not in service, and then to be incremented again after
+	 * incrementing bfqq->dispatched.
+	 */
+	bfqq->dispatched++;
+	bfq_remove_request(rq);
+	elv_dispatch_sort(q, rq);
+
+	if (bfq_bfqq_sync(bfqq))
+		bfqd->sync_flight++;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),
+				   rq->cmd_flags);
+#endif
+}
+
+/*
+ * Return the expired request at the head of the fifo, or NULL to just
+ * start from scratch in the rbtree. The fifo is inspected only while
+ * the fifo_expire flag is clear; the flag is set on that inspection.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
+{
+	struct request *head;
+
+	if (bfq_bfqq_fifo_expire(bfqq))
+		return NULL;
+
+	bfq_mark_bfqq_fifo_expire(bfqq);
+
+	if (list_empty(&bfqq->fifo))
+		return NULL;
+
+	head = rq_entry_fifo(bfqq->fifo.next);
+
+	/* The oldest request has not reached its deadline yet. */
+	return time_before(jiffies, head->fifo_time) ? NULL : head;
+}
+
+/* Return the part of @bfqq's budget not yet consumed by service. */
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	return bfqq->entity.budget - bfqq->entity.service;
+}
+
+/*
+ * Expire the in-service queue @bfqq: reset the in-service state and
+ * then either remove the queue from the busy set, if it has no
+ * backlog left, or activate it again.
+ */
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	__bfq_bfqd_reset_in_service(bfqd);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/*
+		 * Overloading budget_timeout field to store the time
+		 * at which the queue remains with no backlog; used by
+		 * the weight-raising mechanism.
+		 */
+		bfqq->budget_timeout = jiffies;
+		bfq_del_bfqq_busy(bfqd, bfqq, 1);
+	} else
+		bfq_activate_bfqq(bfqd, bfqq);
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+				     struct bfq_queue *bfqq,
+				     enum bfqq_expiration reason)
+{
+	struct request *next_rq;
+	int budget, min_budget;
+
+	budget = bfqq->max_budget;
+	min_budget = bfq_min_budget(bfqd);
+
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+		budget, bfq_min_budget(bfqd));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+	if (bfq_bfqq_sync(bfqq)) {
+		switch (reason) {
+		/*
+		 * Caveat: in all the following cases we trade latency
+		 * for throughput.
+		 */
+		case BFQ_BFQQ_TOO_IDLE:
+			/*
+			 * This is the only case where we may reduce
+			 * the budget: if there is no request of the
+			 * process still waiting for completion, then
+			 * we assume (tentatively) that the timer has
+			 * expired because the batch of requests of
+			 * the process could have been served with a
+			 * smaller budget. Hence, betting that
+			 * process will behave in the same way when it
+			 * becomes backlogged again, we reduce its
+			 * next budget. As long as we guess right,
+			 * this budget cut reduces the latency
+			 * experienced by the process.
+			 *
+			 * However, if there are still outstanding
+			 * requests, then the process may have not yet
+			 * issued its next request just because it is
+			 * still waiting for the completion of some of
+			 * the still outstanding ones. So in this
+			 * subcase we do not reduce its budget, on the
+			 * contrary we increase it to possibly boost
+			 * the throughput, as discussed in the
+			 * comments to the BUDGET_TIMEOUT case.
+			 */
+			if (bfqq->dispatched > 0) /* still outstanding reqs */
+				budget = min(budget * 2, bfqd->bfq_max_budget);
+			else {
+				if (budget > 5 * min_budget)
+					budget -= 4 * min_budget;
+				else
+					budget = min_budget;
+			}
+			break;
+		case BFQ_BFQQ_BUDGET_TIMEOUT:
+			/*
+			 * We double the budget here because: 1) it
+			 * gives the chance to boost the throughput if
+			 * this is not a seeky process (which may have
+			 * bumped into this timeout because of, e.g.,
+			 * ZBR), 2) together with charge_full_budget
+			 * it helps give seeky processes higher
+			 * timestamps, and hence be served less
+			 * frequently.
+			 */
+			budget = min(budget * 2, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_BUDGET_EXHAUSTED:
+			/*
+			 * The process still has backlog, and did not
+			 * let either the budget timeout or the disk
+			 * idling timeout expire. Hence it is not
+			 * seeky, has a short thinktime and may be
+			 * happy with a higher budget too. So
+			 * definitely increase the budget of this good
+			 * candidate to boost the disk throughput.
+			 */
+			budget = min(budget * 4, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_NO_MORE_REQUESTS:
+			/*
+			 * Leave the budget unchanged.
+			 */
+		default:
+			return;
+		}
+	} else
+		/*
+		 * Async queues get always the maximum possible budget
+		 * (their ability to dispatch is limited by
+		 * @bfqd->bfq_max_budget_async_rq).
+		 */
+		budget = bfqd->bfq_max_budget;
+
+	bfqq->max_budget = budget;
+
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+	    !bfqd->bfq_user_max_budget)
+		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+	/*
+	 * Make sure that we have enough budget for the next request.
+	 * Since the finish time of the bfqq must be kept in sync with
+	 * the budget, be sure to call __bfq_bfqq_expire() after the
+	 * update.
+	 */
+	next_rq = bfqq->next_rq;
+	if (next_rq)
+		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+					    bfq_serv_to_charge(next_rq, bfqq));
+	else
+		bfqq->entity.budget = bfqq->max_budget;
+
+	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+			next_rq ? blk_rq_sectors(next_rq) : 0,
+			bfqq->entity.budget);
+}
+
+/*
+ * The max_budget calculated when autotuning is equal to the amount of
+ * sectors transferred in timeout_sync at the estimated peak rate:
+ * peak_rate * 1000 * timeout, scaled down by BFQ_RATE_SHIFT bits.
+ */
+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
+{
+	return (unsigned long)(peak_rate * 1000 *
+			       timeout >> BFQ_RATE_SHIFT);
+}
+
+/*
+ * In addition to updating the peak rate, checks whether the process
+ * is "slow", and returns true if so. This slow flag is used, in addition
+ * to the budget timeout, to reduce the amount of service provided to
+ * seeky processes, and hence reduce their chances to lower the
+ * throughput. See the code for more details.
+ *
+ * False is returned, without any measurement, for async queues, for
+ * queues whose budget has just been assigned and for unrealistically
+ * short or long service intervals.
+ */
+static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 bool compensate, enum bfqq_expiration reason)
+{
+	u64 bw, usecs, expected, timeout;
+	ktime_t delta;
+	int update = 0;
+
+	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
+		return false;
+
+	/* With @compensate, the interval ends when idling started. */
+	if (compensate)
+		delta = bfqd->last_idling_start;
+	else
+		delta = ktime_get();
+	delta = ktime_sub(delta, bfqd->last_budget_start);
+	usecs = ktime_to_us(delta);
+
+	/* Don't trust short/unrealistic values. */
+	if (usecs < 100 || usecs >= LONG_MAX)
+		return false;
+
+	/*
+	 * Calculate the bandwidth for the last slice. We use a 64 bit
+	 * value to store the peak rate, in sectors per usec in fixed
+	 * point math. We do so to have enough precision in the estimate
+	 * and to avoid overflows.
+	 */
+	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
+	do_div(bw, (unsigned long)usecs);
+
+	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
+
+	/*
+	 * Use only long (> 20ms) intervals to filter out spikes for
+	 * the peak rate estimation.
+	 */
+	if (usecs > 20000) {
+		if (bw > bfqd->peak_rate ||
+		   (!BFQQ_SEEKY(bfqq) &&
+		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
+			bfq_log(bfqd, "measured bw =%llu", bw);
+			/*
+			 * To smooth oscillations use a low-pass filter with
+			 * alpha=7/8, i.e.,
+			 * new_rate = (7/8) * old_rate + (1/8) * bw
+			 */
+			do_div(bw, 8);
+			if (bw == 0)
+				return false;
+			bfqd->peak_rate *= 7;
+			do_div(bfqd->peak_rate, 8);
+			bfqd->peak_rate += bw;
+			update = 1;
+			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
+		}
+
+		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
+
+		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
+			bfqd->peak_rate_samples++;
+
+		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
+		    update) {
+			int dev_type = blk_queue_nonrot(bfqd->queue);
+
+			if (bfqd->bfq_user_max_budget == 0) {
+				bfqd->bfq_max_budget =
+					bfq_calc_max_budget(bfqd->peak_rate,
+							    timeout);
+				/*
+				 * NOTE(review): the head of this log call
+				 * was missing; the format string is
+				 * restored from the upstream sources.
+				 */
+				bfq_log(bfqd, "new max_budget=%d",
+					bfqd->bfq_max_budget);
+			}
+			if (bfqd->device_speed == BFQ_BFQD_FAST &&
+			    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+				bfqd->device_speed = BFQ_BFQD_SLOW;
+				bfqd->RT_prod = R_slow[dev_type] *
+						T_slow[dev_type];
+			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+			    bfqd->peak_rate > device_speed_thresh[dev_type]) {
+				bfqd->device_speed = BFQ_BFQD_FAST;
+				bfqd->RT_prod = R_fast[dev_type] *
+						T_fast[dev_type];
+			}
+		}
+	}
+
+	/*
+	 * If the process has been served for a too short time
+	 * interval to let its possible sequential accesses prevail on
+	 * the initial seek time needed to move the disk head on the
+	 * first sector it requested, then give the process a chance
+	 * and for the moment return false.
+	 */
+	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
+		return false;
+
+	/*
+	 * A process is considered ``slow'' (i.e., seeky, so that we
+	 * cannot treat it fairly in the service domain, as it would
+	 * slow down too much the other processes) if, when a slice
+	 * ends for whatever reason, it has received service at a
+	 * rate that would not be high enough to complete the budget
+	 * before the budget timeout expiration.
+	 */
+	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
+
+	/*
+	 * Caveat: processes doing IO in the slower disk zones will
+	 * tend to be slow(er) even if not seeky. And the estimated
+	 * peak rate will actually be an average over the disk
+	 * surface. Hence, to not be too harsh with unlucky processes,
+	 * we keep a budget/3 margin of safety before declaring a
+	 * process slow.
+	 */
+	return expected > (4 * bfqq->entity.budget) / 3;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application to be
+ * deemed as soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur: