xref: /DragonOS/kernel/src/sched/mod.rs (revision cf7f801e1d50ee5b04cb728e4251a57f4183bfbc)
1 pub mod clock;
2 pub mod completion;
3 pub mod cputime;
4 pub mod fair;
5 pub mod idle;
6 pub mod pelt;
7 pub mod prio;
8 pub mod syscall;
9 
10 use core::{
11     intrinsics::{likely, unlikely},
12     sync::atomic::{compiler_fence, fence, AtomicUsize, Ordering},
13 };
14 
15 use alloc::{
16     boxed::Box,
17     collections::LinkedList,
18     sync::{Arc, Weak},
19     vec::Vec,
20 };
21 use system_error::SystemError;
22 
23 use crate::{
24     arch::{interrupt::ipi::send_ipi, CurrentIrqArch},
25     exception::{
26         ipi::{IpiKind, IpiTarget},
27         InterruptArch,
28     },
29     libs::{
30         lazy_init::Lazy,
31         spinlock::{SpinLock, SpinLockGuard},
32     },
33     mm::percpu::{PerCpu, PerCpuVar},
34     process::{ProcessControlBlock, ProcessFlags, ProcessManager, ProcessState, SchedInfo},
35     sched::idle::IdleScheduler,
36     smp::{core::smp_get_processor_id, cpu::ProcessorId},
37     time::{clocksource::HZ, timer::clock},
38 };
39 
40 use self::{
41     clock::{ClockUpdataFlag, SchedClock},
42     cputime::{irq_time_read, CpuTimeFunc, IrqTime},
43     fair::{CfsRunQueue, CompletelyFairScheduler, FairSchedEntity},
44     prio::PrioUtil,
45 };
46 
47 static mut CPU_IRQ_TIME: Option<Vec<&'static mut IrqTime>> = None;
48 
49 // 这里虽然rq是percpu的,但是在负载均衡的时候需要修改对端cpu的rq,所以仍需加锁
50 static CPU_RUNQUEUE: Lazy<PerCpuVar<Arc<CpuRunQueue>>> = PerCpuVar::define_lazy();
51 
52 /// 用于记录系统中所有 CPU 的可执行进程数量的总和。
53 static CALCULATE_LOAD_TASKS: AtomicUsize = AtomicUsize::new(0);
54 
55 const LOAD_FREQ: usize = HZ as usize * 5 + 1;
56 
57 pub const SCHED_FIXEDPOINT_SHIFT: u64 = 10;
58 #[allow(dead_code)]
59 pub const SCHED_FIXEDPOINT_SCALE: u64 = 1 << SCHED_FIXEDPOINT_SHIFT;
60 #[allow(dead_code)]
61 pub const SCHED_CAPACITY_SHIFT: u64 = SCHED_FIXEDPOINT_SHIFT;
62 #[allow(dead_code)]
63 pub const SCHED_CAPACITY_SCALE: u64 = 1 << SCHED_CAPACITY_SHIFT;
64 
65 #[inline]
cpu_irq_time(cpu: usize) -> &'static mut IrqTime66 pub fn cpu_irq_time(cpu: usize) -> &'static mut IrqTime {
67     unsafe { CPU_IRQ_TIME.as_mut().unwrap()[cpu] }
68 }
69 
70 #[inline]
cpu_rq(cpu: usize) -> Arc<CpuRunQueue>71 pub fn cpu_rq(cpu: usize) -> Arc<CpuRunQueue> {
72     CPU_RUNQUEUE.ensure();
73     unsafe {
74         CPU_RUNQUEUE
75             .get()
76             .force_get(ProcessorId::new(cpu as u32))
77             .clone()
78     }
79 }
80 
81 lazy_static! {
82     pub static ref SCHED_FEATURES: SchedFeature = SchedFeature::GENTLE_FAIR_SLEEPERS
83         | SchedFeature::START_DEBIT
84         | SchedFeature::LAST_BUDDY
85         | SchedFeature::CACHE_HOT_BUDDY
86         | SchedFeature::WAKEUP_PREEMPTION
87         | SchedFeature::NONTASK_CAPACITY
88         | SchedFeature::TTWU_QUEUE
89         | SchedFeature::SIS_UTIL
90         | SchedFeature::RT_PUSH_IPI
91         | SchedFeature::ALT_PERIOD
92         | SchedFeature::BASE_SLICE
93         | SchedFeature::UTIL_EST
94         | SchedFeature::UTIL_EST_FASTUP;
95 }
96 
97 pub trait Scheduler {
98     /// ## 加入当任务进入可运行状态时调用。它将调度实体(任务)放到红黑树中,增加nr_running变量的值。
enqueue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag)99     fn enqueue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag);
100 
101     /// ## 当任务不再可运行时被调用,对应的调度实体被移出红黑树。它减少nr_running变量的值。
dequeue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag)102     fn dequeue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag);
103 
104     /// ## 主动让出cpu,这个函数的行为基本上是出队,紧接着入队
yield_task(rq: &mut CpuRunQueue)105     fn yield_task(rq: &mut CpuRunQueue);
106 
107     /// ## 检查进入可运行状态的任务能否抢占当前正在运行的任务
check_preempt_currnet( rq: &mut CpuRunQueue, pcb: &Arc<ProcessControlBlock>, flags: WakeupFlags, )108     fn check_preempt_currnet(
109         rq: &mut CpuRunQueue,
110         pcb: &Arc<ProcessControlBlock>,
111         flags: WakeupFlags,
112     );
113 
114     /// ## 选择接下来最适合运行的任务
115     #[allow(dead_code)]
pick_task(rq: &mut CpuRunQueue) -> Option<Arc<ProcessControlBlock>>116     fn pick_task(rq: &mut CpuRunQueue) -> Option<Arc<ProcessControlBlock>>;
117 
118     /// ## 选择接下来最适合运行的任务
pick_next_task( rq: &mut CpuRunQueue, pcb: Option<Arc<ProcessControlBlock>>, ) -> Option<Arc<ProcessControlBlock>>119     fn pick_next_task(
120         rq: &mut CpuRunQueue,
121         pcb: Option<Arc<ProcessControlBlock>>,
122     ) -> Option<Arc<ProcessControlBlock>>;
123 
124     /// ## 被时间滴答函数调用,它可能导致进程切换。驱动了运行时抢占。
tick(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, queued: bool)125     fn tick(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, queued: bool);
126 
127     /// ## 在进程fork时,如需加入cfs,则调用
task_fork(pcb: Arc<ProcessControlBlock>)128     fn task_fork(pcb: Arc<ProcessControlBlock>);
129 
put_prev_task(rq: &mut CpuRunQueue, prev: Arc<ProcessControlBlock>)130     fn put_prev_task(rq: &mut CpuRunQueue, prev: Arc<ProcessControlBlock>);
131 }
132 
133 /// 调度策略
134 #[allow(dead_code)]
135 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
136 pub enum SchedPolicy {
137     /// 实时进程
138     RT,
139     /// 先进先出调度
140     FIFO,
141     /// 完全公平调度
142     CFS,
143     /// IDLE
144     IDLE,
145 }
146 
147 #[allow(dead_code)]
148 pub struct TaskGroup {
149     /// CFS管理的调度实体,percpu的
150     entitys: Vec<Arc<FairSchedEntity>>,
151     /// 每个CPU的CFS运行队列
152     cfs: Vec<Arc<CfsRunQueue>>,
153     /// 父节点
154     parent: Option<Arc<TaskGroup>>,
155 
156     shares: u64,
157 }
158 
159 #[derive(Debug, Default)]
160 pub struct LoadWeight {
161     /// 负载权重
162     pub weight: u64,
163     /// weight的倒数,方便计算
164     pub inv_weight: u32,
165 }
166 
167 impl LoadWeight {
168     /// 用于限制权重在一个合适的区域内
169     pub const SCHED_FIXEDPOINT_SHIFT: u32 = 10;
170 
171     pub const WMULT_SHIFT: u32 = 32;
172     pub const WMULT_CONST: u32 = !0;
173 
174     pub const NICE_0_LOAD_SHIFT: u32 = Self::SCHED_FIXEDPOINT_SHIFT + Self::SCHED_FIXEDPOINT_SHIFT;
175 
update_load_add(&mut self, inc: u64)176     pub fn update_load_add(&mut self, inc: u64) {
177         self.weight += inc;
178         self.inv_weight = 0;
179     }
180 
update_load_sub(&mut self, dec: u64)181     pub fn update_load_sub(&mut self, dec: u64) {
182         self.weight -= dec;
183         self.inv_weight = 0;
184     }
185 
update_load_set(&mut self, weight: u64)186     pub fn update_load_set(&mut self, weight: u64) {
187         self.weight = weight;
188         self.inv_weight = 0;
189     }
190 
191     /// ## 更新负载权重的倒数
update_inv_weight(&mut self)192     pub fn update_inv_weight(&mut self) {
193         // 已经更新
194         if likely(self.inv_weight != 0) {
195             return;
196         }
197 
198         let w = Self::scale_load_down(self.weight);
199 
200         if unlikely(w >= Self::WMULT_CONST as u64) {
201             // 高位有数据
202             self.inv_weight = 1;
203         } else if unlikely(w == 0) {
204             // 倒数去最大
205             self.inv_weight = Self::WMULT_CONST;
206         } else {
207             // 计算倒数
208             self.inv_weight = Self::WMULT_CONST / w as u32;
209         }
210     }
211 
212     /// ## 计算任务的执行时间差
213     ///
214     /// 计算公式:(delta_exec * (weight * self.inv_weight)) >> WMULT_SHIFT
calculate_delta(&mut self, delta_exec: u64, weight: u64) -> u64215     pub fn calculate_delta(&mut self, delta_exec: u64, weight: u64) -> u64 {
216         // 降低精度
217         let mut fact = Self::scale_load_down(weight);
218 
219         // 记录fact高32位
220         let mut fact_hi = (fact >> 32) as u32;
221         // 用于恢复
222         let mut shift = Self::WMULT_SHIFT;
223 
224         self.update_inv_weight();
225 
226         if unlikely(fact_hi != 0) {
227             // 这里表示高32位还有数据
228             // 需要计算最高位,然后继续调整fact
229             let fs = 32 - fact_hi.leading_zeros();
230             shift -= fs;
231 
232             // 确保高32位全为0
233             fact >>= fs;
234         }
235 
236         // 这里确定了fact已经在32位内
237         fact *= self.inv_weight as u64;
238 
239         fact_hi = (fact >> 32) as u32;
240 
241         if fact_hi != 0 {
242             // 这里表示高32位还有数据
243             // 需要计算最高位,然后继续调整fact
244             let fs = 32 - fact_hi.leading_zeros();
245             shift -= fs;
246 
247             // 确保高32位全为0
248             fact >>= fs;
249         }
250 
251         return ((delta_exec as u128 * fact as u128) >> shift) as u64;
252     }
253 
254     /// ## 将负载权重缩小到到一个小的范围中计算,相当于减小精度计算
scale_load_down(mut weight: u64) -> u64255     pub const fn scale_load_down(mut weight: u64) -> u64 {
256         if weight != 0 {
257             weight >>= Self::SCHED_FIXEDPOINT_SHIFT;
258 
259             if weight < 2 {
260                 weight = 2;
261             }
262         }
263         weight
264     }
265 
266     #[allow(dead_code)]
scale_load(weight: u64) -> u64267     pub const fn scale_load(weight: u64) -> u64 {
268         weight << Self::SCHED_FIXEDPOINT_SHIFT
269     }
270 }
271 
272 pub trait SchedArch {
273     /// 开启当前核心的调度
enable_sched_local()274     fn enable_sched_local();
275     /// 关闭当前核心的调度
276     #[allow(dead_code)]
disable_sched_local()277     fn disable_sched_local();
278 
279     /// 在第一次开启调度之前,进行初始化工作。
280     ///
281     /// 注意区别于sched_init,这个函数只是做初始化时钟的工作等等。
initial_setup_sched_local()282     fn initial_setup_sched_local() {}
283 }
284 
285 /// ## PerCpu的运行队列,其中维护了各个调度器对应的rq
286 #[allow(dead_code)]
287 #[derive(Debug)]
288 pub struct CpuRunQueue {
289     lock: SpinLock<()>,
290     lock_on_who: AtomicUsize,
291 
292     cpu: usize,
293     clock_task: u64,
294     clock: u64,
295     prev_irq_time: u64,
296     clock_updata_flags: ClockUpdataFlag,
297 
298     /// 过载
299     overload: bool,
300 
301     next_balance: u64,
302 
303     /// 运行任务数
304     nr_running: usize,
305 
306     /// 被阻塞的任务数量
307     nr_uninterruptible: usize,
308 
309     /// 记录上次更新负载时间
310     cala_load_update: usize,
311     cala_load_active: usize,
312 
313     /// CFS调度器
314     cfs: Arc<CfsRunQueue>,
315 
316     clock_pelt: u64,
317     lost_idle_time: u64,
318     clock_idle: u64,
319 
320     cfs_tasks: LinkedList<Arc<FairSchedEntity>>,
321 
322     /// 最近一次的调度信息
323     sched_info: SchedInfo,
324 
325     /// 当前在运行队列上执行的进程
326     current: Weak<ProcessControlBlock>,
327 
328     idle: Weak<ProcessControlBlock>,
329 }
330 
331 impl CpuRunQueue {
new(cpu: usize) -> Self332     pub fn new(cpu: usize) -> Self {
333         Self {
334             lock: SpinLock::new(()),
335             lock_on_who: AtomicUsize::new(usize::MAX),
336             cpu,
337             clock_task: 0,
338             clock: 0,
339             prev_irq_time: 0,
340             clock_updata_flags: ClockUpdataFlag::empty(),
341             overload: false,
342             next_balance: 0,
343             nr_running: 0,
344             nr_uninterruptible: 0,
345             cala_load_update: (clock() + (5 * HZ + 1)) as usize,
346             cala_load_active: 0,
347             cfs: Arc::new(CfsRunQueue::new()),
348             clock_pelt: 0,
349             lost_idle_time: 0,
350             clock_idle: 0,
351             cfs_tasks: LinkedList::new(),
352             sched_info: SchedInfo::default(),
353             current: Weak::new(),
354             idle: Weak::new(),
355         }
356     }
357 
358     /// 此函数只能在关中断的情况下使用!!!
359     /// 获取到rq的可变引用,需要注意的是返回的第二个值需要确保其生命周期
360     /// 所以可以说这个函数是unsafe的,需要确保正确性
361     /// 在中断上下文,关中断的情况下,此函数是安全的
self_lock(&self) -> (&mut Self, Option<SpinLockGuard<()>>)362     pub fn self_lock(&self) -> (&mut Self, Option<SpinLockGuard<()>>) {
363         if self.lock.is_locked()
364             && smp_get_processor_id().data() as usize == self.lock_on_who.load(Ordering::SeqCst)
365         {
366             // 在本cpu已上锁则可以直接拿
367             (
368                 unsafe {
369                     (self as *const Self as usize as *mut Self)
370                         .as_mut()
371                         .unwrap()
372                 },
373                 None,
374             )
375         } else {
376             // 否则先上锁再拿
377             let guard = self.lock();
378             (
379                 unsafe {
380                     (self as *const Self as usize as *mut Self)
381                         .as_mut()
382                         .unwrap()
383                 },
384                 Some(guard),
385             )
386         }
387     }
388 
lock(&self) -> SpinLockGuard<()>389     fn lock(&self) -> SpinLockGuard<()> {
390         let guard = self.lock.lock_irqsave();
391 
392         // 更新在哪一个cpu上锁
393         self.lock_on_who
394             .store(smp_get_processor_id().data() as usize, Ordering::SeqCst);
395 
396         guard
397     }
398 
enqueue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag)399     pub fn enqueue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag) {
400         if !flags.contains(EnqueueFlag::ENQUEUE_NOCLOCK) {
401             self.update_rq_clock();
402         }
403 
404         if !flags.contains(EnqueueFlag::ENQUEUE_RESTORE) {
405             let sched_info = pcb.sched_info().sched_stat.upgradeable_read_irqsave();
406             if sched_info.last_queued == 0 {
407                 sched_info.upgrade().last_queued = self.clock;
408             }
409         }
410 
411         match pcb.sched_info().policy() {
412             SchedPolicy::CFS => CompletelyFairScheduler::enqueue(self, pcb, flags),
413             SchedPolicy::FIFO => todo!(),
414             SchedPolicy::RT => todo!(),
415             SchedPolicy::IDLE => IdleScheduler::enqueue(self, pcb, flags),
416         }
417 
418         // TODO:https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/core.c#239
419     }
420 
dequeue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag)421     pub fn dequeue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag) {
422         // TODO:sched_core
423 
424         if !flags.contains(DequeueFlag::DEQUEUE_NOCLOCK) {
425             self.update_rq_clock()
426         }
427 
428         if !flags.contains(DequeueFlag::DEQUEUE_SAVE) {
429             let sched_info = pcb.sched_info().sched_stat.upgradeable_read_irqsave();
430 
431             if sched_info.last_queued > 0 {
432                 let delta = self.clock - sched_info.last_queued;
433 
434                 let mut sched_info = sched_info.upgrade();
435                 sched_info.last_queued = 0;
436                 sched_info.run_delay += delta as usize;
437 
438                 self.sched_info.run_delay += delta as usize;
439             }
440         }
441 
442         match pcb.sched_info().policy() {
443             SchedPolicy::CFS => CompletelyFairScheduler::dequeue(self, pcb, flags),
444             SchedPolicy::FIFO => todo!(),
445             SchedPolicy::RT => todo!(),
446             SchedPolicy::IDLE => IdleScheduler::dequeue(self, pcb, flags),
447         }
448     }
449 
450     /// 启用一个任务,将加入队列
activate_task(&mut self, pcb: &Arc<ProcessControlBlock>, mut flags: EnqueueFlag)451     pub fn activate_task(&mut self, pcb: &Arc<ProcessControlBlock>, mut flags: EnqueueFlag) {
452         if *pcb.sched_info().on_rq.lock_irqsave() == OnRq::Migrating {
453             flags |= EnqueueFlag::ENQUEUE_MIGRATED;
454         }
455 
456         if flags.contains(EnqueueFlag::ENQUEUE_MIGRATED) {
457             todo!()
458         }
459 
460         self.enqueue_task(pcb.clone(), flags);
461 
462         *pcb.sched_info().on_rq.lock_irqsave() = OnRq::Queued;
463     }
464 
465     /// 检查对应的task是否可以抢占当前运行的task
466     #[allow(clippy::comparison_chain)]
check_preempt_currnet(&mut self, pcb: &Arc<ProcessControlBlock>, flags: WakeupFlags)467     pub fn check_preempt_currnet(&mut self, pcb: &Arc<ProcessControlBlock>, flags: WakeupFlags) {
468         if pcb.sched_info().policy() == self.current().sched_info().policy() {
469             match self.current().sched_info().policy() {
470                 SchedPolicy::CFS => {
471                     CompletelyFairScheduler::check_preempt_currnet(self, pcb, flags)
472                 }
473                 SchedPolicy::FIFO => todo!(),
474                 SchedPolicy::RT => todo!(),
475                 SchedPolicy::IDLE => IdleScheduler::check_preempt_currnet(self, pcb, flags),
476             }
477         } else if pcb.sched_info().policy() < self.current().sched_info().policy() {
478             // 调度优先级更高
479             self.resched_current();
480         }
481 
482         if *self.current().sched_info().on_rq.lock_irqsave() == OnRq::Queued
483             && self.current().flags().contains(ProcessFlags::NEED_SCHEDULE)
484         {
485             self.clock_updata_flags
486                 .insert(ClockUpdataFlag::RQCF_REQ_SKIP);
487         }
488     }
489 
490     /// 禁用一个任务,将离开队列
deactivate_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag)491     pub fn deactivate_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag) {
492         *pcb.sched_info().on_rq.lock_irqsave() = if flags.contains(DequeueFlag::DEQUEUE_SLEEP) {
493             OnRq::None
494         } else {
495             OnRq::Migrating
496         };
497 
498         self.dequeue_task(pcb, flags);
499     }
500 
501     #[inline]
cfs_rq(&self) -> Arc<CfsRunQueue>502     pub fn cfs_rq(&self) -> Arc<CfsRunQueue> {
503         self.cfs.clone()
504     }
505 
506     /// 更新rq时钟
update_rq_clock(&mut self)507     pub fn update_rq_clock(&mut self) {
508         // 需要跳过这次时钟更新
509         if self
510             .clock_updata_flags
511             .contains(ClockUpdataFlag::RQCF_ACT_SKIP)
512         {
513             return;
514         }
515 
516         let clock = SchedClock::sched_clock_cpu(self.cpu);
517         if clock < self.clock {
518             return;
519         }
520 
521         let delta = clock - self.clock;
522         self.clock += delta;
523         // error!("clock {}", self.clock);
524         self.update_rq_clock_task(delta);
525     }
526 
527     /// 更新任务时钟
update_rq_clock_task(&mut self, mut delta: u64)528     pub fn update_rq_clock_task(&mut self, mut delta: u64) {
529         let mut irq_delta = irq_time_read(self.cpu) - self.prev_irq_time;
530         // if self.cpu == 0 {
531         //     error!(
532         //         "cpu 0 delta {delta} irq_delta {} irq_time_read(self.cpu) {} self.prev_irq_time {}",
533         //         irq_delta,
534         //         irq_time_read(self.cpu),
535         //         self.prev_irq_time
536         //     );
537         // }
538         compiler_fence(Ordering::SeqCst);
539 
540         if irq_delta > delta {
541             irq_delta = delta;
542         }
543 
544         self.prev_irq_time += irq_delta;
545 
546         delta -= irq_delta;
547 
548         // todo: psi?
549 
550         // send_to_default_serial8250_port(format!("\n{delta}\n",).as_bytes());
551         compiler_fence(Ordering::SeqCst);
552         self.clock_task += delta;
553         compiler_fence(Ordering::SeqCst);
554         // if self.cpu == 0 {
555         //     error!("cpu {} clock_task {}", self.cpu, self.clock_task);
556         // }
557         // todo: pelt?
558     }
559 
560     /// 计算当前进程中的可执行数量
calculate_load_fold_active(&mut self, adjust: usize) -> usize561     fn calculate_load_fold_active(&mut self, adjust: usize) -> usize {
562         let mut nr_active = self.nr_running - adjust;
563         nr_active += self.nr_uninterruptible;
564         let mut delta = 0;
565 
566         if nr_active != self.cala_load_active {
567             delta = nr_active - self.cala_load_active;
568             self.cala_load_active = nr_active;
569         }
570 
571         delta
572     }
573 
574     /// ## tick计算全局负载
calculate_global_load_tick(&mut self)575     pub fn calculate_global_load_tick(&mut self) {
576         if clock() < self.cala_load_update as u64 {
577             // 如果当前时间在上次更新时间之前,则直接返回
578             return;
579         }
580 
581         let delta = self.calculate_load_fold_active(0);
582 
583         if delta != 0 {
584             CALCULATE_LOAD_TASKS.fetch_add(delta, Ordering::SeqCst);
585         }
586 
587         self.cala_load_update += LOAD_FREQ;
588     }
589 
add_nr_running(&mut self, nr_running: usize)590     pub fn add_nr_running(&mut self, nr_running: usize) {
591         let prev = self.nr_running;
592 
593         self.nr_running = prev + nr_running;
594         if prev < 2 && self.nr_running >= 2 && !self.overload {
595             self.overload = true;
596         }
597     }
598 
sub_nr_running(&mut self, count: usize)599     pub fn sub_nr_running(&mut self, count: usize) {
600         self.nr_running -= count;
601     }
602 
603     /// 在运行idle?
sched_idle_rq(&self) -> bool604     pub fn sched_idle_rq(&self) -> bool {
605         return unlikely(
606             self.nr_running == self.cfs.idle_h_nr_running as usize && self.nr_running > 0,
607         );
608     }
609 
610     #[inline]
current(&self) -> Arc<ProcessControlBlock>611     pub fn current(&self) -> Arc<ProcessControlBlock> {
612         self.current.upgrade().unwrap()
613     }
614 
615     #[inline]
set_current(&mut self, pcb: Weak<ProcessControlBlock>)616     pub fn set_current(&mut self, pcb: Weak<ProcessControlBlock>) {
617         self.current = pcb;
618     }
619 
620     #[inline]
set_idle(&mut self, pcb: Weak<ProcessControlBlock>)621     pub fn set_idle(&mut self, pcb: Weak<ProcessControlBlock>) {
622         self.idle = pcb;
623     }
624 
625     #[inline]
clock_task(&self) -> u64626     pub fn clock_task(&self) -> u64 {
627         self.clock_task
628     }
629 
630     /// 重新调度当前进程
resched_current(&self)631     pub fn resched_current(&self) {
632         let current = self.current();
633 
634         // 又需要被调度?
635         if unlikely(current.flags().contains(ProcessFlags::NEED_SCHEDULE)) {
636             return;
637         }
638 
639         let cpu = self.cpu;
640 
641         if cpu == smp_get_processor_id().data() as usize {
642             // assert!(
643             //     Arc::ptr_eq(&current, &ProcessManager::current_pcb()),
644             //     "rq current name {} process current {}",
645             //     current.basic().name().to_string(),
646             //     ProcessManager::current_pcb().basic().name().to_string(),
647             // );
648             // 设置需要调度
649             ProcessManager::current_pcb()
650                 .flags()
651                 .insert(ProcessFlags::NEED_SCHEDULE);
652             return;
653         }
654 
655         // 向目标cpu发送重调度ipi
656         send_resched_ipi(ProcessorId::new(cpu as u32));
657     }
658 
659     /// 选择下一个task
pick_next_task(&mut self, prev: Arc<ProcessControlBlock>) -> Arc<ProcessControlBlock>660     pub fn pick_next_task(&mut self, prev: Arc<ProcessControlBlock>) -> Arc<ProcessControlBlock> {
661         if likely(prev.sched_info().policy() >= SchedPolicy::CFS)
662             && self.nr_running == self.cfs.h_nr_running as usize
663         {
664             let p = CompletelyFairScheduler::pick_next_task(self, Some(prev.clone()));
665 
666             if let Some(pcb) = p.as_ref() {
667                 return pcb.clone();
668             } else {
669                 // error!(
670                 //     "pick idle cfs rq {:?}",
671                 //     self.cfs_rq()
672                 //         .entities
673                 //         .iter()
674                 //         .map(|x| x.1.pid)
675                 //         .collect::<Vec<_>>()
676                 // );
677                 match prev.sched_info().policy() {
678                     SchedPolicy::FIFO => todo!(),
679                     SchedPolicy::RT => todo!(),
680                     SchedPolicy::CFS => CompletelyFairScheduler::put_prev_task(self, prev),
681                     SchedPolicy::IDLE => IdleScheduler::put_prev_task(self, prev),
682                 }
683                 // 选择idle
684                 return self.idle.upgrade().unwrap();
685             }
686         }
687 
688         todo!()
689     }
690 }
691 
692 bitflags! {
693     pub struct SchedFeature:u32 {
694         /// 给予睡眠任务仅有 50% 的服务赤字。这意味着睡眠任务在被唤醒后会获得一定的服务,但不能过多地占用资源。
695         const GENTLE_FAIR_SLEEPERS = 1 << 0;
696         /// 将新任务排在前面,以避免已经运行的任务被饿死
697         const START_DEBIT = 1 << 1;
698         /// 在调度时优先选择上次唤醒的任务,因为它可能会访问之前唤醒的任务所使用的数据,从而提高缓存局部性。
699         const NEXT_BUDDY = 1 << 2;
700         /// 在调度时优先选择上次运行的任务,因为它可能会访问与之前运行的任务相同的数据,从而提高缓存局部性。
701         const LAST_BUDDY = 1 << 3;
702         /// 认为任务的伙伴(buddy)在缓存中是热点,减少缓存伙伴被迁移的可能性,从而提高缓存局部性。
703         const CACHE_HOT_BUDDY = 1 << 4;
704         /// 允许唤醒时抢占当前任务。
705         const WAKEUP_PREEMPTION = 1 << 5;
706         /// 基于任务未运行时间来减少 CPU 的容量。
707         const NONTASK_CAPACITY = 1 << 6;
708         /// 将远程唤醒排队到目标 CPU,并使用调度器 IPI 处理它们,以减少运行队列锁的争用。
709         const TTWU_QUEUE = 1 << 7;
710         /// 在唤醒时尝试限制对最后级联缓存(LLC)域的无谓扫描。
711         const SIS_UTIL = 1 << 8;
712         /// 在 RT(Real-Time)任务迁移时,通过发送 IPI 来减少 CPU 之间的锁竞争。
713         const RT_PUSH_IPI = 1 << 9;
714         /// 启用估计的 CPU 利用率功能,用于调度决策。
715         const UTIL_EST = 1 << 10;
716         const UTIL_EST_FASTUP = 1 << 11;
717         /// 启用备选调度周期
718         const ALT_PERIOD = 1 << 12;
719         /// 启用基本时间片
720         const BASE_SLICE = 1 << 13;
721     }
722 
723     pub struct EnqueueFlag: u8 {
724         const ENQUEUE_WAKEUP	= 0x01;
725         const ENQUEUE_RESTORE	= 0x02;
726         const ENQUEUE_MOVE	= 0x04;
727         const ENQUEUE_NOCLOCK	= 0x08;
728 
729         const ENQUEUE_MIGRATED	= 0x40;
730 
731         const ENQUEUE_INITIAL	= 0x80;
732     }
733 
734     pub struct DequeueFlag: u8 {
735         const DEQUEUE_SLEEP		= 0x01;
736         const DEQUEUE_SAVE		= 0x02; /* Matches ENQUEUE_RESTORE */
737         const DEQUEUE_MOVE		= 0x04; /* Matches ENQUEUE_MOVE */
738         const DEQUEUE_NOCLOCK		= 0x08; /* Matches ENQUEUE_NOCLOCK */
739     }
740 
741     pub struct WakeupFlags: u8 {
742         /* Wake flags. The first three directly map to some SD flag value */
743         const WF_EXEC         = 0x02; /* Wakeup after exec; maps to SD_BALANCE_EXEC */
744         const WF_FORK         = 0x04; /* Wakeup after fork; maps to SD_BALANCE_FORK */
745         const WF_TTWU         = 0x08; /* Wakeup;            maps to SD_BALANCE_WAKE */
746 
747         const WF_SYNC         = 0x10; /* Waker goes to sleep after wakeup */
748         const WF_MIGRATED     = 0x20; /* Internal use, task got migrated */
749         const WF_CURRENT_CPU  = 0x40; /* Prefer to move the wakee to the current CPU. */
750     }
751 
752     pub struct SchedMode: u8 {
753         /*
754         * Constants for the sched_mode argument of __schedule().
755         *
756         * The mode argument allows RT enabled kernels to differentiate a
757         * preemption from blocking on an 'sleeping' spin/rwlock. Note that
758         * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
759         * optimize the AND operation out and just check for zero.
760         */
761         /// 在调度过程中不会再次进入队列,即需要手动唤醒
762         const SM_NONE			= 0x0;
763         /// 重新加入队列,即当前进程被抢占,需要时钟调度
764         const SM_PREEMPT		= 0x1;
765         /// rt相关
766         const SM_RTLOCK_WAIT		= 0x2;
767         /// 默认与SM_PREEMPT相同
768         const SM_MASK_PREEMPT	= Self::SM_PREEMPT.bits;
769     }
770 }
771 
772 #[derive(Copy, Clone, Debug, PartialEq)]
773 pub enum OnRq {
774     Queued,
775     Migrating,
776     None,
777 }
778 
779 impl ProcessManager {
update_process_times(user_tick: bool)780     pub fn update_process_times(user_tick: bool) {
781         let pcb = Self::current_pcb();
782         CpuTimeFunc::irqtime_account_process_tick(&pcb, user_tick, 1);
783 
784         scheduler_tick();
785     }
786 }
787 
788 /// ## 时钟tick时调用此函数
scheduler_tick()789 pub fn scheduler_tick() {
790     fence(Ordering::SeqCst);
791     // 获取当前CPU索引
792     let cpu_idx = smp_get_processor_id().data() as usize;
793 
794     // 获取当前CPU的请求队列
795     let rq = cpu_rq(cpu_idx);
796 
797     let (rq, guard) = rq.self_lock();
798 
799     // 获取当前请求队列的当前请求
800     let current = rq.current();
801 
802     // 更新请求队列时钟
803     rq.update_rq_clock();
804 
805     match current.sched_info().policy() {
806         SchedPolicy::CFS => CompletelyFairScheduler::tick(rq, current, false),
807         SchedPolicy::FIFO => todo!(),
808         SchedPolicy::RT => todo!(),
809         SchedPolicy::IDLE => IdleScheduler::tick(rq, current, false),
810     }
811 
812     rq.calculate_global_load_tick();
813 
814     drop(guard);
815     // TODO:处理负载均衡
816 }
817 
818 /// ## 执行调度
819 /// 若preempt_count不为0则报错
820 #[inline]
schedule(sched_mod: SchedMode)821 pub fn schedule(sched_mod: SchedMode) {
822     let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
823     assert_eq!(ProcessManager::current_pcb().preempt_count(), 0);
824     __schedule(sched_mod);
825 }
826 
827 /// ## 执行调度
828 /// 此函数与schedule的区别为,该函数不会检查preempt_count
829 /// 适用于时钟中断等场景
__schedule(sched_mod: SchedMode)830 pub fn __schedule(sched_mod: SchedMode) {
831     let cpu = smp_get_processor_id().data() as usize;
832     let rq = cpu_rq(cpu);
833 
834     let mut prev = rq.current();
835     if let ProcessState::Exited(_) = prev.clone().sched_info().inner_lock_read_irqsave().state() {
836         // 从exit进的Schedule
837         prev = ProcessManager::current_pcb();
838     }
839 
840     // TODO: hrtick_clear(rq);
841 
842     let (rq, _guard) = rq.self_lock();
843 
844     rq.clock_updata_flags = ClockUpdataFlag::from_bits_truncate(rq.clock_updata_flags.bits() << 1);
845 
846     rq.update_rq_clock();
847     rq.clock_updata_flags = ClockUpdataFlag::RQCF_UPDATE;
848 
849     // kBUG!(
850     //     "before cfs rq pcbs {:?}\nvruntimes {:?}\n",
851     //     rq.cfs
852     //         .entities
853     //         .iter()
854     //         .map(|x| { x.1.pcb().pid() })
855     //         .collect::<Vec<_>>(),
856     //     rq.cfs
857     //         .entities
858     //         .iter()
859     //         .map(|x| { x.1.vruntime })
860     //         .collect::<Vec<_>>(),
861     // );
862     // warn!(
863     //     "before cfs rq {:?} prev {:?}",
864     //     rq.cfs
865     //         .entities
866     //         .iter()
867     //         .map(|x| { x.1.pcb().pid() })
868     //         .collect::<Vec<_>>(),
869     //     prev.pid()
870     // );
871 
872     // error!("prev pid {:?} {:?}", prev.pid(), prev.sched_info().policy());
873     if !sched_mod.contains(SchedMode::SM_MASK_PREEMPT)
874         && prev.sched_info().policy() != SchedPolicy::IDLE
875         && prev.sched_info().inner_lock_read_irqsave().is_mark_sleep()
876     {
877         // warn!("deactivate_task prev {:?}", prev.pid());
878         // TODO: 这里需要处理信号
879         // https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/core.c?r=&mo=172979&fi=6578#6630
880         rq.deactivate_task(
881             prev.clone(),
882             DequeueFlag::DEQUEUE_SLEEP | DequeueFlag::DEQUEUE_NOCLOCK,
883         );
884     }
885 
886     let next = rq.pick_next_task(prev.clone());
887 
888     // kBUG!(
889     //     "after cfs rq pcbs {:?}\nvruntimes {:?}\n",
890     //     rq.cfs
891     //         .entities
892     //         .iter()
893     //         .map(|x| { x.1.pcb().pid() })
894     //         .collect::<Vec<_>>(),
895     //     rq.cfs
896     //         .entities
897     //         .iter()
898     //         .map(|x| { x.1.vruntime })
899     //         .collect::<Vec<_>>(),
900     // );
901 
902     // error!("next {:?}", next.pid());
903 
904     prev.flags().remove(ProcessFlags::NEED_SCHEDULE);
905     fence(Ordering::SeqCst);
906     if likely(!Arc::ptr_eq(&prev, &next)) {
907         rq.set_current(Arc::downgrade(&next));
908         // warn!(
909         //     "switch_process prev {:?} next {:?} sched_mode {sched_mod:?}",
910         //     prev.pid(),
911         //     next.pid()
912         // );
913 
914         // send_to_default_serial8250_port(
915         //     format!(
916         //         "switch_process prev {:?} next {:?} sched_mode {sched_mod:?}\n",
917         //         prev.pid(),
918         //         next.pid()
919         //     )
920         //     .as_bytes(),
921         // );
922 
923         // CurrentApic.send_eoi();
924         compiler_fence(Ordering::SeqCst);
925 
926         unsafe { ProcessManager::switch_process(prev, next) };
927     } else {
928         assert!(
929             Arc::ptr_eq(&ProcessManager::current_pcb(), &prev),
930             "{}",
931             ProcessManager::current_pcb().basic().name()
932         );
933     }
934 }
935 
sched_fork(pcb: &Arc<ProcessControlBlock>) -> Result<(), SystemError>936 pub fn sched_fork(pcb: &Arc<ProcessControlBlock>) -> Result<(), SystemError> {
937     let mut prio_guard = pcb.sched_info().prio_data.write_irqsave();
938     let current = ProcessManager::current_pcb();
939 
940     prio_guard.prio = current.sched_info().prio_data.read_irqsave().normal_prio;
941 
942     if PrioUtil::dl_prio(prio_guard.prio) {
943         return Err(SystemError::EAGAIN_OR_EWOULDBLOCK);
944     } else if PrioUtil::rt_prio(prio_guard.prio) {
945         let policy = &pcb.sched_info().sched_policy;
946         *policy.write_irqsave() = SchedPolicy::RT;
947     } else {
948         let policy = &pcb.sched_info().sched_policy;
949         *policy.write_irqsave() = SchedPolicy::CFS;
950     }
951 
952     pcb.sched_info()
953         .sched_entity()
954         .force_mut()
955         .init_entity_runnable_average();
956 
957     Ok(())
958 }
959 
sched_cgroup_fork(pcb: &Arc<ProcessControlBlock>)960 pub fn sched_cgroup_fork(pcb: &Arc<ProcessControlBlock>) {
961     __set_task_cpu(pcb, smp_get_processor_id());
962     match pcb.sched_info().policy() {
963         SchedPolicy::RT => todo!(),
964         SchedPolicy::FIFO => todo!(),
965         SchedPolicy::CFS => CompletelyFairScheduler::task_fork(pcb.clone()),
966         SchedPolicy::IDLE => todo!(),
967     }
968 }
969 
__set_task_cpu(pcb: &Arc<ProcessControlBlock>, cpu: ProcessorId)970 fn __set_task_cpu(pcb: &Arc<ProcessControlBlock>, cpu: ProcessorId) {
971     // TODO: Fixme There is not implement group sched;
972     let se = pcb.sched_info().sched_entity();
973     let rq = cpu_rq(cpu.data() as usize);
974     se.force_mut().set_cfs(Arc::downgrade(&rq.cfs));
975 }
976 
977 #[inline(never)]
sched_init()978 pub fn sched_init() {
979     // 初始化percpu变量
980     unsafe {
981         CPU_IRQ_TIME = Some(Vec::with_capacity(PerCpu::MAX_CPU_NUM as usize));
982         CPU_IRQ_TIME
983             .as_mut()
984             .unwrap()
985             .resize_with(PerCpu::MAX_CPU_NUM as usize, || Box::leak(Box::default()));
986 
987         let mut cpu_runqueue = Vec::with_capacity(PerCpu::MAX_CPU_NUM as usize);
988         for cpu in 0..PerCpu::MAX_CPU_NUM as usize {
989             let rq = Arc::new(CpuRunQueue::new(cpu));
990             rq.cfs.force_mut().set_rq(Arc::downgrade(&rq));
991             cpu_runqueue.push(rq);
992         }
993 
994         CPU_RUNQUEUE.init(PerCpuVar::new(cpu_runqueue).unwrap());
995     };
996 }
997 
998 #[inline]
send_resched_ipi(cpu: ProcessorId)999 pub fn send_resched_ipi(cpu: ProcessorId) {
1000     send_ipi(IpiKind::KickCpu, IpiTarget::Specified(cpu));
1001 }
1002