/*
 * CPU ConCurrency (CC) measures the CPU load by averaging
 * the number of running tasks. Using CC, the scheduler can
 * evaluate the load of CPUs to improve load balance for power
 * efficiency without sacrificing performance.
 *
 * Copyright (C) 2013 Intel, Inc.
 *
 * Author: Du, Yuyang
 *
 * CPU Workload Consolidation consolidates the workload onto the smallest
 * number of CPUs that are capable of handling it. We measure the
 * capability of a CPU by CC, then compare it with a threshold,
 * and finally run the workload on the non-shielded CPUs if they are
 * predicted capable after the consolidation.
 *
 * Copyright (C) 2013 Intel, Inc.
 *
 * Authors: Rudramuni, Vishwesh M
 *          Du, Yuyang
 */

#ifdef CONFIG_CPU_CONCURRENCY

#include "sched.h"

/*
 * the sum period of time is 2^26 ns (~64ms) by default
 */
unsigned long sysctl_concurrency_sum_period = 26UL;

/*
 * the number of sum periods, after which the original
 * will be reduced/decayed to half
 */
unsigned long sysctl_concurrency_decay_rate = 1UL;

/*
 * the contrib period of time is 2^10 ns (~1us) by default;
 * us has better precision than ms, and
 * 1024 makes use of a faster shift than a div
 */
static unsigned long cc_contrib_period = 10UL;

#ifdef CONFIG_WORKLOAD_CONSOLIDATION
/*
 * whether we use concurrency to select the cpu to run
 * the woken-up task
 */
static unsigned long wc_wakeup = 1UL;

/*
 * a cpu whose concurrency is lower than this percentage
 * is capable of running the wakee
 */
static unsigned long wc_wakeup_threshold = 80UL;

/*
 * aggressively push the task even if it is hot
 */
static unsigned long wc_push_hot_task = 1UL;
#endif

/*
 * the concurrency is scaled up for decaying,
 * thus, concurrency 1 is effectively 2^cc_resolution (1024),
 * which can be halved by 10 half-life periods
 */
static unsigned long cc_resolution = 10UL;

/*
 * after this number of half-life periods, even
 * (1<<32)-1 (which is sufficiently large) is less than 1
 */
static unsigned long cc_decay_max_pds = 32UL;

static inline unsigned long cc_scale_up(unsigned long c)
{
	return c << cc_resolution;
}

static inline unsigned long cc_scale_down(unsigned long c)
{
	return c >> cc_resolution;
}

/* from nanoseconds to sum periods */
static inline u64 cc_sum_pds(u64 n)
{
	return n >> sysctl_concurrency_sum_period;
}

/* from sum periods to timestamp in ns */
static inline u64 cc_timestamp(u64 p)
{
	return p << sysctl_concurrency_sum_period;
}

/*
 * from nanoseconds to contrib periods, because
 * raw ns is risky enough to overflow cc->contrib
 */
static inline u64 cc_contrib_pds(u64 n)
{
	return n >> cc_contrib_period;
}

/*
 * cc_decay_factor only works for a 32bit integer;
 * cc_decay_factor_x, where x indicates the number of periods
 * as half-life (sysctl_concurrency_decay_rate)
 */
static const unsigned long cc_decay_factor_1[] = {
	0xFFFFFFFF,
};

static const unsigned long cc_decay_factor_2[] = {
	0xFFFFFFFF,
	0xB504F333,
};

static const unsigned long cc_decay_factor_4[] = {
	0xFFFFFFFF,
	0xD744FCCA,
	0xB504F333,
	0x9837F051,
};

static const unsigned long cc_decay_factor_8[] = {
	0xFFFFFFFF,
	0xEAC0C6E7,
	0xD744FCCA,
	0xC5672A11,
	0xB504F333,
	0xA5FED6A9,
	0x9837F051,
	0x8B95C1E3,
};

/* by default sysctl_concurrency_decay_rate */
static const unsigned long *cc_decay_factor = cc_decay_factor_1;
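/*
 * Derivation sketch (assumption, not from the original patch text): each
 * cc_decay_factor_x[i] above appears to be the 32bit fixed-point multiplier
 * floor(2^32 * 2^(-i/x)), with the i = 0 entry clamped to 0xFFFFFFFF so it
 * fits in 32 bits.  For example, 2^32 * 2^(-1/2) = 3037000499 = 0xB504F333.
 * A userspace generator (hypothetical, not part of the kernel build) would
 * look roughly like:
 *
 *	#include <math.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int x = 2, i;	/ * half-life in periods: 1, 2, 4 or 8 * /
 *
 *		for (i = 0; i < x; i++) {
 *			double f = 4294967296.0 * pow(2.0, -i / (double)x);
 *			printf("\t0x%08lX,\n",
 *			       i ? (unsigned long)f : 0xFFFFFFFFUL);
 *		}
 *		return 0;
 *	}
 */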
/*
 * cc_decayed_sum depends on cc_resolution (fixed 10),
 * cc_decayed_sum_x, x indicates the number of periods
 * as half-life (sysctl_concurrency_decay_rate)
 */
static const unsigned long cc_decayed_sum_1[] = {
	0, 512, 768, 896, 960, 992, 1008, 1016, 1020, 1022, 1023,
};

static const unsigned long cc_decayed_sum_2[] = {
	0, 724, 1235, 1597, 1853, 2034, 2162, 2252, 2316,
	2361, 2393, 2416, 2432, 2443, 2451, 2457, 2461, 2464,
	2466, 2467, 2468, 2469,
};

static const unsigned long cc_decayed_sum_4[] = {
	0, 861, 1585, 2193, 2705, 3135, 3497, 3801, 4057,
	4272, 4453, 4605, 4733, 4840, 4930, 5006, 5070, 5124,
	5169, 5207, 5239, 5266, 5289, 5308, 5324, 5337, 5348,
	5358, 5366, 5373, 5379, 5384, 5388, 5391, 5394, 5396,
	5398, 5400, 5401, 5402, 5403, 5404, 5405, 5406,
};

static const unsigned long cc_decayed_sum_8[] = {
	0, 939, 1800, 2589, 3313, 3977, 4585, 5143, 5655,
	6124, 6554, 6949, 7311, 7643, 7947, 8226, 8482, 8717,
	8932, 9129, 9310, 9476, 9628, 9767, 9895, 10012, 10120,
	10219, 10309, 10392, 10468, 10538, 10602, 10661, 10715,
	10764, 10809, 10850, 10888, 10923, 10955, 10984, 11011,
	11036, 11059, 11080, 11099, 11116, 11132, 11147, 11160,
	11172, 11183, 11193, 11203, 11212, 11220, 11227, 11234,
	11240, 11246, 11251, 11256, 11260, 11264, 11268, 11271,
	11274, 11277, 11280, 11282, 11284, 11286, 11288, 11290,
	11291, 11292, 11293, 11294, 11295, 11296, 11297, 11298,
	11299, 11300, 11301, 11302,
};

/* by default sysctl_concurrency_decay_rate */
static const unsigned long *cc_decayed_sum = cc_decayed_sum_1;

/*
 * the last index of cc_decayed_sum array
 */
static unsigned long cc_decayed_sum_len =
	sizeof(cc_decayed_sum_1) / sizeof(cc_decayed_sum_1[0]) - 1;

/*
 * sysctl handler to update decay rate
 */
int concurrency_decay_rate_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	switch (sysctl_concurrency_decay_rate) {
	case 1:
		cc_decay_factor = cc_decay_factor_1;
		cc_decayed_sum = cc_decayed_sum_1;
		cc_decayed_sum_len =
			sizeof(cc_decayed_sum_1) / sizeof(cc_decayed_sum_1[0]) - 1;
		break;
	case 2:
		cc_decay_factor = cc_decay_factor_2;
		cc_decayed_sum = cc_decayed_sum_2;
		cc_decayed_sum_len =
			sizeof(cc_decayed_sum_2) / sizeof(cc_decayed_sum_2[0]) - 1;
		break;
	case 4:
		cc_decay_factor = cc_decay_factor_4;
		cc_decayed_sum = cc_decayed_sum_4;
		cc_decayed_sum_len =
			sizeof(cc_decayed_sum_4) / sizeof(cc_decayed_sum_4[0]) - 1;
		break;
	case 8:
		cc_decay_factor = cc_decay_factor_8;
		cc_decayed_sum = cc_decayed_sum_8;
		cc_decayed_sum_len =
			sizeof(cc_decayed_sum_8) / sizeof(cc_decayed_sum_8[0]) - 1;
		break;
	default:
		return -EINVAL;
	}

	cc_decay_max_pds *= sysctl_concurrency_decay_rate;

	return 0;
}

/*
 * decay concurrency at some decay rate
 */
static inline u64 decay_cc(u64 cc, u64 periods)
{
	u32 periods_l;

	if (periods <= 0)
		return cc;

	if (unlikely(periods >= cc_decay_max_pds))
		return 0;

	/* now period is not too large */
	periods_l = (u32)periods;
	if (periods_l >= sysctl_concurrency_decay_rate) {
		cc >>= periods_l / sysctl_concurrency_decay_rate;
		periods_l %= sysctl_concurrency_decay_rate;
	}

	if (!periods_l)
		return cc;

	cc *= cc_decay_factor[periods_l];

	return cc >> 32;
}

/*
 * add missed periods by predefined constants
 */
static inline u64 cc_missed_pds(u64 periods)
{
	if (periods <= 0)
		return 0;

	if (periods > cc_decayed_sum_len)
		periods = cc_decayed_sum_len;

	return cc_decayed_sum[periods];
}

/*
 * scale up nr_running, because we decay
 */
static inline unsigned long cc_weight(unsigned long nr_running)
{
	/*
	 * scaling factor, this should be tunable
	 */
	return cc_scale_up(nr_running);
}
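/*
 * Worked example (sketch, assuming the default decay rate of 1): one task
 * running for a full sum period adds cc_weight(1) = 1024 to cc->sum, and
 * decay_cc(2048, 3) simply shifts right by 3 periods and returns 256.
 * With sysctl_concurrency_decay_rate = 2, decay_cc(2048, 3) first shifts
 * by 3 / 2 = 1 period (2048 -> 1024), then applies cc_decay_factor[1]:
 * (1024 * 0xB504F333) >> 32 = 724, i.e. 1024 * 2^(-1/2).  The
 * cc_decayed_sum tables are the matching running sums: for rate 1,
 * 512 + 256 + 128 + ... converges to 1023, which is what an always-running
 * task accumulates over the missed periods.
 */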
static inline void __update_concurrency(struct rq *rq, u64 now,
					struct cpu_concurrency_t *cc)
{
	u64 sum_pds, sum_pds_s, sum_pds_e;
	u64 contrib_pds, ts_contrib, contrib_pds_one;
	u64 sum_now;
	unsigned long weight;
	int updated = 0;

	/*
	 * guarantee contrib_timestamp always >= sum_timestamp,
	 * and sum_timestamp is at period boundary
	 */
	if (now <= cc->sum_timestamp) {
		cc->sum_timestamp = cc_timestamp(cc_sum_pds(now));
		cc->contrib_timestamp = now;
		return;
	}

	weight = cc_weight(cc->nr_running);

	/* start and end of sum periods */
	sum_pds_s = cc_sum_pds(cc->sum_timestamp);
	sum_pds_e = cc_sum_pds(now);
	sum_pds = sum_pds_e - sum_pds_s;
	/* number of contrib periods in one sum period */
	contrib_pds_one = cc_contrib_pds(cc_timestamp(1));

	/*
	 * if we have passed at least one period,
	 * we need to do four things:
	 */
	if (sum_pds) {
		/* 1) complete the last period */
		ts_contrib = cc_timestamp(sum_pds_s + 1);
		contrib_pds = cc_contrib_pds(ts_contrib);
		contrib_pds -= cc_contrib_pds(cc->contrib_timestamp);

		if (likely(contrib_pds))
			cc->contrib += weight * contrib_pds;

		cc->contrib = div64_u64(cc->contrib, contrib_pds_one);

		cc->sum += cc->contrib;
		cc->contrib = 0;

		/* 2) update/decay them */
		cc->sum = decay_cc(cc->sum, sum_pds);
		sum_now = decay_cc(cc->sum, sum_pds - 1);

		/* 3) compensate missed periods if any */
		sum_pds -= 1;
		cc->sum += cc->nr_running * cc_missed_pds(sum_pds);
		sum_now += cc->nr_running * cc_missed_pds(sum_pds - 1);
		updated = 1;

		/* 4) update contrib timestamp to period boundary */
		ts_contrib = cc_timestamp(sum_pds_e);

		cc->sum_timestamp = ts_contrib;
		cc->contrib_timestamp = ts_contrib;
	}

	/* current period */
	contrib_pds = cc_contrib_pds(now);
	contrib_pds -= cc_contrib_pds(cc->contrib_timestamp);

	if (likely(contrib_pds))
		cc->contrib += weight * contrib_pds;

	/* new nr_running for next update */
	cc->nr_running = rq->nr_running;

	/*
	 * we need to account for the current sum period,
	 * if now has passed 1/2 of sum period, we contribute,
	 * otherwise, we use the last complete sum period
	 */
	contrib_pds = cc_contrib_pds(now - cc->sum_timestamp);

	if (contrib_pds > contrib_pds_one / 2) {
		sum_now = div64_u64(cc->contrib, contrib_pds);
		sum_now += cc->sum;
		updated = 1;
	}

	if (updated == 1)
		cc->sum_now = sum_now;
	cc->contrib_timestamp = now;
}

void init_cpu_concurrency(struct rq *rq)
{
	rq->concurrency.sum = 0;
	rq->concurrency.sum_now = 0;
	rq->concurrency.contrib = 0;
	rq->concurrency.nr_running = 0;
	rq->concurrency.sum_timestamp = ULLONG_MAX;
	rq->concurrency.contrib_timestamp = ULLONG_MAX;
#ifdef CONFIG_WORKLOAD_CONSOLIDATION
	rq->concurrency.unload = 0;
#endif
}
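/*
 * Worked example (sketch, using the defaults above): a sum period is
 * 2^26 ns (~67 ms) and a contrib period is 2^10 ns (~1 us), so
 * contrib_pds_one = cc_contrib_pds(cc_timestamp(1)) = 2^16 = 65536.  If
 * two tasks stay runnable for one whole sum period, cc->contrib
 * accumulates cc_weight(2) * 65536 and the per-period contribution folded
 * into cc->sum is cc->contrib / 65536 = 2048, i.e. concurrency 2 at
 * cc_resolution scale.  Older periods are then halved every
 * sysctl_concurrency_decay_rate periods by decay_cc(), so cc->sum is an
 * exponentially decayed sum of the concurrency history.
 */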
/*
 * we update cpu concurrency at:
 * 1) enqueue task, which increases concurrency
 * 2) dequeue task, which decreases concurrency
 * 3) periodic scheduler tick, in case no en/dequeue for long
 * 4) enter and exit idle (necessary?)
 */
void update_cpu_concurrency(struct rq *rq)
{
	/*
	 * protected under rq->lock
	 */
	struct cpu_concurrency_t *cc = &rq->concurrency;
	u64 now = rq->clock;

	__update_concurrency(rq, now, cc);
}
#endif /* CONFIG_CPU_CONCURRENCY */

#ifdef CONFIG_WORKLOAD_CONSOLIDATION
/*
 * whether cpu is capable of having more concurrency
 */
static int cpu_cc_capable(int cpu)
{
	u64 sum = cpu_rq(cpu)->concurrency.sum_now;
	u64 threshold = cc_weight(1);

	sum *= 100;
	sum *= cpu_rq(cpu)->cpu_power;

	threshold *= wc_wakeup_threshold;
	threshold <<= SCHED_POWER_SHIFT;

	if (sum <= threshold)
		return 1;

	return 0;
}

/*
 * we do not select an idle cpu if the cc of the wakee's and
 * the waker's cpu (in this order) is capable of handling
 * the wakee task
 */
int workload_consolidation_wakeup(int prev, int target)
{
	if (!wc_wakeup) {
		if (idle_cpu(target))
			return target;

		return nr_cpu_ids;
	}

	if (idle_cpu(prev) || cpu_cc_capable(prev))
		return prev;

	if (prev != target && (idle_cpu(target) || cpu_cc_capable(target)))
		return target;

	return nr_cpu_ids;
}

static inline u64 sched_group_cc(struct sched_group *sg)
{
	u64 sg_cc = 0;
	int i;

	for_each_cpu(i, sched_group_cpus(sg))
		sg_cc += cpu_rq(i)->concurrency.sum_now *
			cpu_rq(i)->cpu_power;

	return sg_cc;
}

static inline u64 sched_domain_cc(struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;
	u64 sd_cc = 0;

	do {
		sd_cc += sched_group_cc(sg);
		sg = sg->next;
	} while (sg != sd->groups);

	return sd_cc;
}

static inline struct sched_group *
find_lowest_cc_group(struct sched_group *sg, int span)
{
	u64 grp_cc, min = ULLONG_MAX;
	struct sched_group *lowest = NULL;
	int i;

	for (i = 0; i < span; ++i) {
		grp_cc = sched_group_cc(sg);

		if (grp_cc < min) {
			min = grp_cc;
			lowest = sg;
		}

		sg = sg->next;
	}

	return lowest;
}

static inline u64 __calc_cc_thr(int cpus, unsigned int asym_cc)
{
	u64 thr = cpus;

	thr *= cc_weight(1);
	thr *= asym_cc;
	thr <<= SCHED_POWER_SHIFT;

	return thr;
}

/*
 * can @src_cc of @src_nr cpus be consolidated
 * to @dst_cc of @dst_nr cpus
 */
static inline int
__can_consolidate_cc(u64 src_cc, int src_nr, u64 dst_cc, int dst_nr)
{
	dst_cc *= dst_nr;
	src_nr -= dst_nr;

	if (unlikely(src_nr <= 0))
		return 0;

	src_nr = ilog2(src_nr);
	src_nr += dst_nr;
	src_cc *= src_nr;

	if (src_cc > dst_cc)
		return 0;

	return 1;
}

/*
 * find the group for asymmetric concurrency
 * problem to address: traverse the sd from top down
 */
struct sched_group *
workload_consolidation_find_group(struct sched_domain *sd,
				  struct task_struct *p, int this_cpu)
{
	int half, sg_weight, ns_half = 0;
	struct sched_group *sg;
	u64 sd_cc;

	half = DIV_ROUND_CLOSEST(sd->total_groups, 2);
	sg_weight = sd->groups->group_weight;

	sd_cc = sched_domain_cc(sd);
	sd_cc *= 100;

	while (half) {
		int allowed = 0, i;
		int cpus = sg_weight * half;
		u64 threshold = __calc_cc_thr(cpus, sd->asym_concurrency);

		/*
		 * we do not consider the added cc by this
		 * wakeup (mostly from fork/exec)
		 */
		if (!__can_consolidate_cc(sd_cc, sd->span_weight,
					  threshold, cpus))
			break;

		sg = sd->first_group;
		for (i = 0; i < half; ++i) {
			/* count the group only if it has allowed cpus */
			if (cpumask_intersects(sched_group_cpus(sg),
					       tsk_cpus_allowed(p)))
				allowed = 1;

			sg = sg->next;
		}

		if (!allowed)
			break;

		ns_half = half;
		half /= 2;
	}

	if (!ns_half)
		return NULL;

	if (ns_half == 1)
		return sd->first_group;

	return find_lowest_cc_group(sd->first_group, ns_half);
}
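/*
 * Worked example (sketch; the concrete numbers, including asym_concurrency
 * = 180, are hypothetical): take a domain with 2 groups of 2 cpus each
 * (span_weight = 4, group_weight = 2), every cpu_power at
 * SCHED_POWER_SCALE = 1024, and each cpu averaging half a runnable task
 * (sum_now = 512).  Then sd_cc = 4 * 512 * 1024 * 100 = 209715200 and, for
 * half = 1 (2 destination cpus), __calc_cc_thr(2, 180) = 2 * 1024 * 180
 * << 10 = 377487360.  __can_consolidate_cc() compares
 * 209715200 * (2 + ilog2(4 - 2)) = 629145600 against 377487360 * 2 =
 * 754974720, so the whole domain's concurrency fits on one group and
 * find_lowest_cc_group() picks the less loaded of the two groups.
 */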
/*
 * top_flag_domain - return the top sched_domain containing flag.
 * @cpu:	the cpu whose highest level of sched domain is to
 *		be returned.
 * @flag:	the flag to check for the highest sched_domain
 *		for the given cpu.
 *
 * returns the highest sched_domain of a cpu which contains the given flag;
 * different from highest_flag_domain in that along the domain upward chain
 * a domain may or may not contain the flag.
 */
static inline struct sched_domain *top_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd, *hsd = NULL;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & flag))
			continue;
		hsd = sd;
	}

	return hsd;
}

/*
 * workload_consolidation_cpu_shielded - return whether @cpu is shielded or not
 *
 * traverse downward the sched_domain tree when the sched_domain contains
 * flag SD_ASYM_CONCURRENCY, each sd may have more than two groups, but
 * we assume 1) every sched_group has the same weight, and 2) every CPU has
 * the same computing power
 */
int workload_consolidation_cpu_shielded(int cpu)
{
	struct sched_domain *sd;

	sd = top_flag_domain(cpu, SD_ASYM_CONCURRENCY);

	while (sd) {
		int half, sg_weight, this_sg_nr;
		u64 sd_cc;

		if (!(sd->flags & SD_ASYM_CONCURRENCY)) {
			sd = sd->child;
			continue;
		}

		half = DIV_ROUND_CLOSEST(sd->total_groups, 2);
		sg_weight = sd->groups->group_weight;
		this_sg_nr = sd->group_number;

		sd_cc = sched_domain_cc(sd);
		sd_cc *= 100;

		while (half) {
			int cpus = sg_weight * half;
			u64 threshold = __calc_cc_thr(cpus,
						      sd->asym_concurrency);

			if (!__can_consolidate_cc(sd_cc, sd->span_weight,
						  threshold, cpus))
				return 0;

			if (this_sg_nr >= half)
				return 1;

			half /= 2;
		}

		sd = sd->child;
	}

	return 0;
}

/*
 * as of now, we have the following assumptions:
 * 1) every sched_group has the same weight
 * 2) every CPU has the same computing power
 */
static inline int __nonshielded_groups(struct sched_domain *sd)
{
	int half, sg_weight, ret = 0;
	u64 sd_cc;

	half = DIV_ROUND_CLOSEST(sd->total_groups, 2);
	sg_weight = sd->groups->group_weight;

	sd_cc = sched_domain_cc(sd);
	sd_cc *= 100;

	while (half) {
		int cpus = sg_weight * half;
		u64 threshold = __calc_cc_thr(cpus, sd->asym_concurrency);

		if (!__can_consolidate_cc(sd_cc, sd->span_weight,
					  threshold, cpus))
			return ret;

		ret = half;
		half /= 2;
	}

	return ret;
}

static DEFINE_PER_CPU(struct cpumask, nonshielded_cpumask);

/*
 * workload_consolidation_nonshielded_mask - filter the shielded cpus out
 * of @mask, leaving only the nonshielded cpus
 *
 * traverse downward the sched_domain tree when the sched_domain contains
 * flag SD_ASYM_CONCURRENCY, each sd may have more than two groups
 */
void workload_consolidation_nonshielded_mask(int cpu, struct cpumask *mask)
{
	struct sched_domain *sd;
	struct cpumask *pcpu_mask = &per_cpu(nonshielded_cpumask, cpu);
	int i;

	sd = top_flag_domain(cpu, SD_ASYM_CONCURRENCY);

	if (!sd)
		return;

	while (sd) {
		struct sched_group *sg;
		int this_sg_nr, ns_half;

		if (!(sd->flags & SD_ASYM_CONCURRENCY)) {
			sd = sd->child;
			continue;
		}

		ns_half = __nonshielded_groups(sd);

		if (!ns_half)
			break;

		cpumask_clear(pcpu_mask);
		sg = sd->first_group;

		for (i = 0; i < ns_half; ++i) {
			cpumask_or(pcpu_mask, pcpu_mask,
				   sched_group_cpus(sg));
			sg = sg->next;
		}

		cpumask_and(mask, mask, pcpu_mask);

		this_sg_nr = sd->group_number;
		if (this_sg_nr)
			break;

		sd = sd->child;
	}
}
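/*
 * Worked example (sketch, continuing the hypothetical numbers above): with
 * 2 groups of 2 cpus and the domain's concurrency consolidatable to one
 * group, __nonshielded_groups() returns 1, so only the cpus of
 * sd->first_group (group number 0) stay in the nonshielded mask; for a cpu
 * in group 1, this_sg_nr = 1 >= half = 1 and
 * workload_consolidation_cpu_shielded() reports it as shielded.
 */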
static int cpu_task_hot(struct task_struct *p, u64 now)
{
	s64 delta;

	if (p->sched_class != &fair_sched_class)
		return 0;

	if (unlikely(p->policy == SCHED_IDLE))
		return 0;

	if (sysctl_sched_migration_cost == -1)
		return 1;

	if (sysctl_sched_migration_cost == 0)
		return 0;

	if (wc_push_hot_task)
		return 0;

	/*
	 * buddy candidates are cache hot:
	 */
	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
	    (&p->se == p->se.cfs_rq->next || &p->se == p->se.cfs_rq->last))
		return 1;

	delta = now - p->se.exec_start;

	if (delta < (s64)sysctl_sched_migration_cost)
		return 1;

	return 0;
}

static int cpu_move_task(struct task_struct *p,
			 struct rq *src_rq, struct rq *dst_rq)
{
	/*
	 * we do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (!cpumask_test_cpu(dst_rq->cpu, tsk_cpus_allowed(p)))
		return 0;

	if (task_running(src_rq, p))
		return 0;

	/*
	 * aggressive migration if task is cache cold
	 */
	if (!cpu_task_hot(p, src_rq->clock_task)) {
		/*
		 * move a task
		 */
		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, dst_rq->cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);
		return 1;
	}

	return 0;
}

/*
 * __unload_cpu_work is run by src cpu stopper, which pushes running
 * tasks off src cpu onto dst cpu
 */
static int __unload_cpu_work(void *data)
{
	struct rq *src_rq = data;
	int src_cpu = cpu_of(src_rq);
	struct cpu_concurrency_t *cc = &src_rq->concurrency;
	struct rq *dst_rq = cpu_rq(cc->dst_cpu);

	struct list_head *tasks = &src_rq->cfs_tasks;
	struct task_struct *p, *n;
	int pushed = 0;
	int nr_migrate_break = 1;

	raw_spin_lock_irq(&src_rq->lock);

	/* make sure the requested cpu hasn't gone down in the meantime */
	if (unlikely(src_cpu != smp_processor_id() || !cc->unload))
		goto out_unlock;

	/* Is there any task to move? */
	if (src_rq->nr_running <= 1)
		goto out_unlock;

	double_lock_balance(src_rq, dst_rq);

	list_for_each_entry_safe(p, n, tasks, se.group_node) {

		if (!cpu_move_task(p, src_rq, dst_rq))
			continue;

		pushed++;

		if (pushed >= nr_migrate_break)
			break;
	}

	double_unlock_balance(src_rq, dst_rq);

out_unlock:
	cc->unload = 0;
	raw_spin_unlock_irq(&src_rq->lock);

	return 0;
}

/*
 * unload src_cpu to dst_cpu
 */
static void unload_cpu(int src_cpu, int dst_cpu)
{
	unsigned long flags;
	struct rq *src_rq = cpu_rq(src_cpu);
	struct cpu_concurrency_t *cc = &src_rq->concurrency;
	int unload = 0;

	raw_spin_lock_irqsave(&src_rq->lock, flags);

	if (!cc->unload) {
		cc->unload = 1;
		cc->dst_cpu = dst_cpu;
		unload = 1;
	}

	raw_spin_unlock_irqrestore(&src_rq->lock, flags);

	if (unload)
		stop_one_cpu_nowait(src_cpu, __unload_cpu_work, src_rq,
				    &cc->unload_work);
}

static inline int find_lowest_cc_cpu(struct cpumask *mask)
{
	u64 cpu_cc, min = ULLONG_MAX;
	int i, lowest = nr_cpu_ids;
	struct rq *rq;

	for_each_cpu(i, mask) {
		rq = cpu_rq(i);
		cpu_cc = rq->concurrency.sum_now * rq->cpu_power;

		if (cpu_cc < min) {
			min = cpu_cc;
			lowest = i;
		}
	}

	return lowest;
}

/*
 * find the lowest cc cpu in shielded and nonshielded cpus,
 * aggressively unload the shielded to the nonshielded
 */
void workload_consolidation_unload(struct cpumask *nonshielded)
{
	int src_cpu = nr_cpu_ids, dst_cpu, i;
	u64 cpu_cc, min = ULLONG_MAX;
	struct rq *rq;

	for_each_cpu_not(i, nonshielded) {
		if (i >= nr_cpu_ids)
			break;

		rq = cpu_rq(i);
		if (rq->nr_running <= 0)
			continue;

		cpu_cc = rq->concurrency.sum_now * rq->cpu_power;
		if (cpu_cc < min) {
			min = cpu_cc;
			src_cpu = i;
		}
	}

	if (src_cpu >= nr_cpu_ids)
		return;

	dst_cpu = find_lowest_cc_cpu(nonshielded);
	if (dst_cpu >= nr_cpu_ids)
		return;

	if (src_cpu != dst_cpu)
		unload_cpu(src_cpu, dst_cpu);
}
#endif /* CONFIG_WORKLOAD_CONSOLIDATION */
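/*
 * Unload flow sketch (hypothetical caller; the real call sites live outside
 * this file): a periodic path could shrink the active set roughly like
 *
 *	struct cpumask nonshielded;
 *
 *	cpumask_copy(&nonshielded, cpu_active_mask);
 *	workload_consolidation_nonshielded_mask(this_cpu, &nonshielded);
 *	if (!cpumask_empty(&nonshielded))
 *		workload_consolidation_unload(&nonshielded);
 *
 * unload_cpu() queues at most one stopper work per source cpu: cc->unload
 * is set under rq->lock and cleared again by __unload_cpu_work(), so
 * overlapping unload requests are simply dropped until the pending push
 * of (at most nr_migrate_break) tasks has completed.
 */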