xref: /DragonOS/kernel/src/sched/mod.rs (revision 9fab312ea9921618629924ab15c28c2d255b21c6)
1 pub mod clock;
2 pub mod completion;
3 pub mod cputime;
4 pub mod fair;
5 pub mod idle;
6 pub mod pelt;
7 pub mod prio;
8 
9 use core::{
10     intrinsics::{likely, unlikely},
11     sync::atomic::{compiler_fence, fence, AtomicUsize, Ordering},
12 };
13 
14 use alloc::{
15     boxed::Box,
16     collections::LinkedList,
17     sync::{Arc, Weak},
18     vec::Vec,
19 };
20 use system_error::SystemError;
21 
22 use crate::{
23     arch::{interrupt::ipi::send_ipi, CurrentIrqArch},
24     exception::{
25         ipi::{IpiKind, IpiTarget},
26         InterruptArch,
27     },
28     libs::{
29         lazy_init::Lazy,
30         spinlock::{SpinLock, SpinLockGuard},
31     },
32     mm::percpu::{PerCpu, PerCpuVar},
33     process::{ProcessControlBlock, ProcessFlags, ProcessManager, ProcessState, SchedInfo},
34     sched::idle::IdleScheduler,
35     smp::{core::smp_get_processor_id, cpu::ProcessorId},
36     time::{clocksource::HZ, timer::clock},
37 };
38 
39 use self::{
40     clock::{ClockUpdataFlag, SchedClock},
41     cputime::{irq_time_read, CpuTimeFunc, IrqTime},
42     fair::{CfsRunQueue, CompletelyFairScheduler, FairSchedEntity},
43     prio::PrioUtil,
44 };
45 
46 static mut CPU_IRQ_TIME: Option<Vec<&'static mut IrqTime>> = None;
47 
48 // 这里虽然rq是percpu的,但是在负载均衡的时候需要修改对端cpu的rq,所以仍需加锁
49 static CPU_RUNQUEUE: Lazy<PerCpuVar<Arc<CpuRunQueue>>> = PerCpuVar::define_lazy();
50 
51 /// 用于记录系统中所有 CPU 的可执行进程数量的总和。
52 static CALCULATE_LOAD_TASKS: AtomicUsize = AtomicUsize::new(0);
53 
54 const LOAD_FREQ: usize = HZ as usize * 5 + 1;
55 
56 pub const SCHED_FIXEDPOINT_SHIFT: u64 = 10;
57 #[allow(dead_code)]
58 pub const SCHED_FIXEDPOINT_SCALE: u64 = 1 << SCHED_FIXEDPOINT_SHIFT;
59 #[allow(dead_code)]
60 pub const SCHED_CAPACITY_SHIFT: u64 = SCHED_FIXEDPOINT_SHIFT;
61 #[allow(dead_code)]
62 pub const SCHED_CAPACITY_SCALE: u64 = 1 << SCHED_CAPACITY_SHIFT;
63 
64 #[inline]
65 pub fn cpu_irq_time(cpu: usize) -> &'static mut IrqTime {
66     unsafe { CPU_IRQ_TIME.as_mut().unwrap()[cpu] }
67 }
68 
69 #[inline]
70 pub fn cpu_rq(cpu: usize) -> Arc<CpuRunQueue> {
71     CPU_RUNQUEUE.ensure();
72     unsafe {
73         CPU_RUNQUEUE
74             .get()
75             .force_get(ProcessorId::new(cpu as u32))
76             .clone()
77     }
78 }
79 
80 lazy_static! {
81     pub static ref SCHED_FEATURES: SchedFeature = SchedFeature::GENTLE_FAIR_SLEEPERS
82         | SchedFeature::START_DEBIT
83         | SchedFeature::LAST_BUDDY
84         | SchedFeature::CACHE_HOT_BUDDY
85         | SchedFeature::WAKEUP_PREEMPTION
86         | SchedFeature::NONTASK_CAPACITY
87         | SchedFeature::TTWU_QUEUE
88         | SchedFeature::SIS_UTIL
89         | SchedFeature::RT_PUSH_IPI
90         | SchedFeature::ALT_PERIOD
91         | SchedFeature::BASE_SLICE
92         | SchedFeature::UTIL_EST
93         | SchedFeature::UTIL_EST_FASTUP;
94 }
95 
96 pub trait Scheduler {
97     /// ## 加入当任务进入可运行状态时调用。它将调度实体(任务)放到红黑树中,增加nr_running变量的值。
98     fn enqueue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag);
99 
100     /// ## 当任务不再可运行时被调用,对应的调度实体被移出红黑树。它减少nr_running变量的值。
101     fn dequeue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag);
102 
103     /// ## 主动让出cpu,这个函数的行为基本上是出队,紧接着入队
104     fn yield_task(rq: &mut CpuRunQueue);
105 
106     /// ## 检查进入可运行状态的任务能否抢占当前正在运行的任务
107     fn check_preempt_currnet(
108         rq: &mut CpuRunQueue,
109         pcb: &Arc<ProcessControlBlock>,
110         flags: WakeupFlags,
111     );
112 
113     /// ## 选择接下来最适合运行的任务
114     fn pick_task(rq: &mut CpuRunQueue) -> Option<Arc<ProcessControlBlock>>;
115 
116     /// ## 选择接下来最适合运行的任务
117     fn pick_next_task(
118         rq: &mut CpuRunQueue,
119         pcb: Option<Arc<ProcessControlBlock>>,
120     ) -> Option<Arc<ProcessControlBlock>>;
121 
122     /// ## 被时间滴答函数调用,它可能导致进程切换。驱动了运行时抢占。
123     fn tick(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, queued: bool);
124 
125     /// ## 在进程fork时,如需加入cfs,则调用
126     fn task_fork(pcb: Arc<ProcessControlBlock>);
127 
128     fn put_prev_task(rq: &mut CpuRunQueue, prev: Arc<ProcessControlBlock>);
129 }
130 
131 /// 调度策略
132 #[allow(dead_code)]
133 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
134 pub enum SchedPolicy {
135     /// 实时进程
136     RT,
137     /// 先进先出调度
138     FIFO,
139     /// 完全公平调度
140     CFS,
141     /// IDLE
142     IDLE,
143 }
144 
145 #[allow(dead_code)]
146 pub struct TaskGroup {
147     /// CFS管理的调度实体,percpu的
148     entitys: Vec<Arc<FairSchedEntity>>,
149     /// 每个CPU的CFS运行队列
150     cfs: Vec<Arc<CfsRunQueue>>,
151     /// 父节点
152     parent: Option<Arc<TaskGroup>>,
153 
154     shares: u64,
155 }
156 
157 #[derive(Debug, Default)]
158 pub struct LoadWeight {
159     /// 负载权重
160     pub weight: u64,
161     /// weight的倒数,方便计算
162     pub inv_weight: u32,
163 }
164 
165 impl LoadWeight {
166     /// 用于限制权重在一个合适的区域内
167     pub const SCHED_FIXEDPOINT_SHIFT: u32 = 10;
168 
169     pub const WMULT_SHIFT: u32 = 32;
170     pub const WMULT_CONST: u32 = !0;
171 
172     pub const NICE_0_LOAD_SHIFT: u32 = Self::SCHED_FIXEDPOINT_SHIFT + Self::SCHED_FIXEDPOINT_SHIFT;
173 
174     pub fn update_load_add(&mut self, inc: u64) {
175         self.weight += inc;
176         self.inv_weight = 0;
177     }
178 
179     pub fn update_load_sub(&mut self, dec: u64) {
180         self.weight -= dec;
181         self.inv_weight = 0;
182     }
183 
184     pub fn update_load_set(&mut self, weight: u64) {
185         self.weight = weight;
186         self.inv_weight = 0;
187     }
188 
189     /// ## 更新负载权重的倒数
190     pub fn update_inv_weight(&mut self) {
191         // 已经更新
192         if likely(self.inv_weight != 0) {
193             return;
194         }
195 
196         let w = Self::scale_load_down(self.weight);
197 
198         if unlikely(w >= Self::WMULT_CONST as u64) {
199             // 高位有数据
200             self.inv_weight = 1;
201         } else if unlikely(w == 0) {
202             // 倒数去最大
203             self.inv_weight = Self::WMULT_CONST;
204         } else {
205             // 计算倒数
206             self.inv_weight = Self::WMULT_CONST / w as u32;
207         }
208     }
209 
210     /// ## 计算任务的执行时间差
211     ///
212     /// 计算公式:(delta_exec * (weight * self.inv_weight)) >> WMULT_SHIFT
213     pub fn calculate_delta(&mut self, delta_exec: u64, weight: u64) -> u64 {
214         // 降低精度
215         let mut fact = Self::scale_load_down(weight);
216 
217         // 记录fact高32位
218         let mut fact_hi = (fact >> 32) as u32;
219         // 用于恢复
220         let mut shift = Self::WMULT_SHIFT;
221 
222         self.update_inv_weight();
223 
224         if unlikely(fact_hi != 0) {
225             // 这里表示高32位还有数据
226             // 需要计算最高位,然后继续调整fact
227             let fs = 32 - fact_hi.leading_zeros();
228             shift -= fs;
229 
230             // 确保高32位全为0
231             fact >>= fs;
232         }
233 
234         // 这里确定了fact已经在32位内
235         fact *= self.inv_weight as u64;
236 
237         fact_hi = (fact >> 32) as u32;
238 
239         if fact_hi != 0 {
240             // 这里表示高32位还有数据
241             // 需要计算最高位,然后继续调整fact
242             let fs = 32 - fact_hi.leading_zeros();
243             shift -= fs;
244 
245             // 确保高32位全为0
246             fact >>= fs;
247         }
248 
249         return ((delta_exec as u128 * fact as u128) >> shift) as u64;
250     }
251 
252     /// ## 将负载权重缩小到到一个小的范围中计算,相当于减小精度计算
253     pub const fn scale_load_down(mut weight: u64) -> u64 {
254         if weight != 0 {
255             weight >>= Self::SCHED_FIXEDPOINT_SHIFT;
256 
257             if weight < 2 {
258                 weight = 2;
259             }
260         }
261         weight
262     }
263 
264     #[allow(dead_code)]
265     pub const fn scale_load(weight: u64) -> u64 {
266         weight << Self::SCHED_FIXEDPOINT_SHIFT
267     }
268 }
269 
270 pub trait SchedArch {
271     /// 开启当前核心的调度
272     fn enable_sched_local();
273     /// 关闭当前核心的调度
274     fn disable_sched_local();
275 
276     /// 在第一次开启调度之前,进行初始化工作。
277     ///
278     /// 注意区别于sched_init,这个函数只是做初始化时钟的工作等等。
279     fn initial_setup_sched_local() {}
280 }
281 
282 /// ## PerCpu的运行队列,其中维护了各个调度器对应的rq
283 #[allow(dead_code)]
284 #[derive(Debug)]
285 pub struct CpuRunQueue {
286     lock: SpinLock<()>,
287     lock_on_who: AtomicUsize,
288 
289     cpu: usize,
290     clock_task: u64,
291     clock: u64,
292     prev_irq_time: u64,
293     clock_updata_flags: ClockUpdataFlag,
294 
295     /// 过载
296     overload: bool,
297 
298     next_balance: u64,
299 
300     /// 运行任务数
301     nr_running: usize,
302 
303     /// 被阻塞的任务数量
304     nr_uninterruptible: usize,
305 
306     /// 记录上次更新负载时间
307     cala_load_update: usize,
308     cala_load_active: usize,
309 
310     /// CFS调度器
311     cfs: Arc<CfsRunQueue>,
312 
313     clock_pelt: u64,
314     lost_idle_time: u64,
315     clock_idle: u64,
316 
317     cfs_tasks: LinkedList<Arc<FairSchedEntity>>,
318 
319     /// 最近一次的调度信息
320     sched_info: SchedInfo,
321 
322     /// 当前在运行队列上执行的进程
323     current: Weak<ProcessControlBlock>,
324 
325     idle: Weak<ProcessControlBlock>,
326 }
327 
328 impl CpuRunQueue {
329     pub fn new(cpu: usize) -> Self {
330         Self {
331             lock: SpinLock::new(()),
332             lock_on_who: AtomicUsize::new(usize::MAX),
333             cpu,
334             clock_task: 0,
335             clock: 0,
336             prev_irq_time: 0,
337             clock_updata_flags: ClockUpdataFlag::empty(),
338             overload: false,
339             next_balance: 0,
340             nr_running: 0,
341             nr_uninterruptible: 0,
342             cala_load_update: (clock() + (5 * HZ + 1)) as usize,
343             cala_load_active: 0,
344             cfs: Arc::new(CfsRunQueue::new()),
345             clock_pelt: 0,
346             lost_idle_time: 0,
347             clock_idle: 0,
348             cfs_tasks: LinkedList::new(),
349             sched_info: SchedInfo::default(),
350             current: Weak::new(),
351             idle: Weak::new(),
352         }
353     }
354 
355     /// 此函数只能在关中断的情况下使用!!!
356     /// 获取到rq的可变引用,需要注意的是返回的第二个值需要确保其生命周期
357     /// 所以可以说这个函数是unsafe的,需要确保正确性
358     /// 在中断上下文,关中断的情况下,此函数是安全的
359     pub fn self_lock(&self) -> (&mut Self, Option<SpinLockGuard<()>>) {
360         if self.lock.is_locked()
361             && smp_get_processor_id().data() as usize == self.lock_on_who.load(Ordering::SeqCst)
362         {
363             // 在本cpu已上锁则可以直接拿
364             (
365                 unsafe { &mut *(self as *const Self as usize as *mut Self) },
366                 None,
367             )
368         } else {
369             // 否则先上锁再拿
370             let guard = self.lock();
371             (
372                 unsafe { &mut *(self as *const Self as usize as *mut Self) },
373                 Some(guard),
374             )
375         }
376     }
377 
378     fn lock(&self) -> SpinLockGuard<()> {
379         let guard = self.lock.lock_irqsave();
380 
381         // 更新在哪一个cpu上锁
382         self.lock_on_who
383             .store(smp_get_processor_id().data() as usize, Ordering::SeqCst);
384 
385         guard
386     }
387 
388     pub fn enqueue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag) {
389         if !flags.contains(EnqueueFlag::ENQUEUE_NOCLOCK) {
390             self.update_rq_clock();
391         }
392 
393         if !flags.contains(EnqueueFlag::ENQUEUE_RESTORE) {
394             let sched_info = pcb.sched_info().sched_stat.upgradeable_read_irqsave();
395             if sched_info.last_queued == 0 {
396                 sched_info.upgrade().last_queued = self.clock;
397             }
398         }
399 
400         match pcb.sched_info().policy() {
401             SchedPolicy::CFS => CompletelyFairScheduler::enqueue(self, pcb, flags),
402             SchedPolicy::FIFO => todo!(),
403             SchedPolicy::RT => todo!(),
404             SchedPolicy::IDLE => IdleScheduler::enqueue(self, pcb, flags),
405         }
406 
407         // TODO:https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/core.c#239
408     }
409 
410     pub fn dequeue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag) {
411         // TODO:sched_core
412 
413         if !flags.contains(DequeueFlag::DEQUEUE_NOCLOCK) {
414             self.update_rq_clock()
415         }
416 
417         if !flags.contains(DequeueFlag::DEQUEUE_SAVE) {
418             let sched_info = pcb.sched_info().sched_stat.upgradeable_read_irqsave();
419 
420             if sched_info.last_queued > 0 {
421                 let delta = self.clock - sched_info.last_queued;
422 
423                 let mut sched_info = sched_info.upgrade();
424                 sched_info.last_queued = 0;
425                 sched_info.run_delay += delta as usize;
426 
427                 self.sched_info.run_delay += delta as usize;
428             }
429         }
430 
431         match pcb.sched_info().policy() {
432             SchedPolicy::CFS => CompletelyFairScheduler::dequeue(self, pcb, flags),
433             SchedPolicy::FIFO => todo!(),
434             SchedPolicy::RT => todo!(),
435             SchedPolicy::IDLE => todo!(),
436         }
437     }
438 
439     /// 启用一个任务,将加入队列
440     pub fn activate_task(&mut self, pcb: &Arc<ProcessControlBlock>, mut flags: EnqueueFlag) {
441         if *pcb.sched_info().on_rq.lock_irqsave() == OnRq::Migrating {
442             flags |= EnqueueFlag::ENQUEUE_MIGRATED;
443         }
444 
445         if flags.contains(EnqueueFlag::ENQUEUE_MIGRATED) {
446             todo!()
447         }
448 
449         self.enqueue_task(pcb.clone(), flags);
450 
451         *pcb.sched_info().on_rq.lock_irqsave() = OnRq::Queued;
452     }
453 
454     /// 检查对应的task是否可以抢占当前运行的task
455     #[allow(clippy::comparison_chain)]
456     pub fn check_preempt_currnet(&mut self, pcb: &Arc<ProcessControlBlock>, flags: WakeupFlags) {
457         if pcb.sched_info().policy() == self.current().sched_info().policy() {
458             match self.current().sched_info().policy() {
459                 SchedPolicy::CFS => {
460                     CompletelyFairScheduler::check_preempt_currnet(self, pcb, flags)
461                 }
462                 SchedPolicy::FIFO => todo!(),
463                 SchedPolicy::RT => todo!(),
464                 SchedPolicy::IDLE => IdleScheduler::check_preempt_currnet(self, pcb, flags),
465             }
466         } else if pcb.sched_info().policy() < self.current().sched_info().policy() {
467             // 调度优先级更高
468             self.resched_current();
469         }
470 
471         if *self.current().sched_info().on_rq.lock_irqsave() == OnRq::Queued
472             && self.current().flags().contains(ProcessFlags::NEED_SCHEDULE)
473         {
474             self.clock_updata_flags
475                 .insert(ClockUpdataFlag::RQCF_REQ_SKIP);
476         }
477     }
478 
479     /// 禁用一个任务,将离开队列
480     pub fn deactivate_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag) {
481         *pcb.sched_info().on_rq.lock_irqsave() = if flags.contains(DequeueFlag::DEQUEUE_SLEEP) {
482             OnRq::None
483         } else {
484             OnRq::Migrating
485         };
486 
487         self.dequeue_task(pcb, flags);
488     }
489 
490     #[inline]
491     pub fn cfs_rq(&self) -> Arc<CfsRunQueue> {
492         self.cfs.clone()
493     }
494 
495     /// 更新rq时钟
496     pub fn update_rq_clock(&mut self) {
497         // 需要跳过这次时钟更新
498         if self
499             .clock_updata_flags
500             .contains(ClockUpdataFlag::RQCF_ACT_SKIP)
501         {
502             return;
503         }
504 
505         let clock = SchedClock::sched_clock_cpu(self.cpu);
506         if clock < self.clock {
507             return;
508         }
509 
510         let delta = clock - self.clock;
511         self.clock += delta;
512         // kerror!("clock {}", self.clock);
513         self.update_rq_clock_task(delta);
514     }
515 
516     /// 更新任务时钟
517     pub fn update_rq_clock_task(&mut self, mut delta: u64) {
518         let mut irq_delta = irq_time_read(self.cpu) - self.prev_irq_time;
519         // if self.cpu == 0 {
520         //     kerror!(
521         //         "cpu 0 delta {delta} irq_delta {} irq_time_read(self.cpu) {} self.prev_irq_time {}",
522         //         irq_delta,
523         //         irq_time_read(self.cpu),
524         //         self.prev_irq_time
525         //     );
526         // }
527         compiler_fence(Ordering::SeqCst);
528 
529         if irq_delta > delta {
530             irq_delta = delta;
531         }
532 
533         self.prev_irq_time += irq_delta;
534 
535         delta -= irq_delta;
536 
537         // todo: psi?
538 
539         // send_to_default_serial8250_port(format!("\n{delta}\n",).as_bytes());
540         compiler_fence(Ordering::SeqCst);
541         self.clock_task += delta;
542         compiler_fence(Ordering::SeqCst);
543         // if self.cpu == 0 {
544         //     kerror!("cpu {} clock_task {}", self.cpu, self.clock_task);
545         // }
546         // todo: pelt?
547     }
548 
549     /// 计算当前进程中的可执行数量
550     fn calculate_load_fold_active(&mut self, adjust: usize) -> usize {
551         let mut nr_active = self.nr_running - adjust;
552         nr_active += self.nr_uninterruptible;
553         let mut delta = 0;
554 
555         if nr_active != self.cala_load_active {
556             delta = nr_active - self.cala_load_active;
557             self.cala_load_active = nr_active;
558         }
559 
560         delta
561     }
562 
563     /// ## tick计算全局负载
564     pub fn calculate_global_load_tick(&mut self) {
565         if clock() < self.cala_load_update as u64 {
566             // 如果当前时间在上次更新时间之前,则直接返回
567             return;
568         }
569 
570         let delta = self.calculate_load_fold_active(0);
571 
572         if delta != 0 {
573             CALCULATE_LOAD_TASKS.fetch_add(delta, Ordering::SeqCst);
574         }
575 
576         self.cala_load_update += LOAD_FREQ;
577     }
578 
579     pub fn add_nr_running(&mut self, nr_running: usize) {
580         let prev = self.nr_running;
581 
582         self.nr_running = prev + nr_running;
583         if prev < 2 && self.nr_running >= 2 && !self.overload {
584             self.overload = true;
585         }
586     }
587 
588     pub fn sub_nr_running(&mut self, count: usize) {
589         self.nr_running -= count;
590     }
591 
592     /// 在运行idle?
593     pub fn sched_idle_rq(&self) -> bool {
594         return unlikely(
595             self.nr_running == self.cfs.idle_h_nr_running as usize && self.nr_running > 0,
596         );
597     }
598 
599     #[inline]
600     pub fn current(&self) -> Arc<ProcessControlBlock> {
601         self.current.upgrade().unwrap()
602     }
603 
604     #[inline]
605     pub fn set_current(&mut self, pcb: Weak<ProcessControlBlock>) {
606         self.current = pcb;
607     }
608 
609     #[inline]
610     pub fn set_idle(&mut self, pcb: Weak<ProcessControlBlock>) {
611         self.idle = pcb;
612     }
613 
614     #[inline]
615     pub fn clock_task(&self) -> u64 {
616         self.clock_task
617     }
618 
619     /// 重新调度当前进程
620     pub fn resched_current(&self) {
621         let current = self.current();
622 
623         // 又需要被调度?
624         if unlikely(current.flags().contains(ProcessFlags::NEED_SCHEDULE)) {
625             return;
626         }
627 
628         let cpu = self.cpu;
629 
630         if cpu == smp_get_processor_id().data() as usize {
631             // assert!(
632             //     Arc::ptr_eq(&current, &ProcessManager::current_pcb()),
633             //     "rq current name {} process current {}",
634             //     current.basic().name().to_string(),
635             //     ProcessManager::current_pcb().basic().name().to_string(),
636             // );
637             // 设置需要调度
638             ProcessManager::current_pcb()
639                 .flags()
640                 .insert(ProcessFlags::NEED_SCHEDULE);
641             return;
642         }
643 
644         // 向目标cpu发送重调度ipi
645         send_resched_ipi(ProcessorId::new(cpu as u32));
646     }
647 
648     /// 选择下一个task
649     pub fn pick_next_task(&mut self, prev: Arc<ProcessControlBlock>) -> Arc<ProcessControlBlock> {
650         if likely(prev.sched_info().policy() >= SchedPolicy::CFS)
651             && self.nr_running == self.cfs.h_nr_running as usize
652         {
653             let p = CompletelyFairScheduler::pick_next_task(self, Some(prev.clone()));
654 
655             if let Some(pcb) = p.as_ref() {
656                 return pcb.clone();
657             } else {
658                 // kerror!(
659                 //     "pick idle cfs rq {:?}",
660                 //     self.cfs_rq()
661                 //         .entities
662                 //         .iter()
663                 //         .map(|x| x.1.pid)
664                 //         .collect::<Vec<_>>()
665                 // );
666                 match prev.sched_info().policy() {
667                     SchedPolicy::FIFO => todo!(),
668                     SchedPolicy::RT => todo!(),
669                     SchedPolicy::CFS => CompletelyFairScheduler::put_prev_task(self, prev),
670                     SchedPolicy::IDLE => IdleScheduler::put_prev_task(self, prev),
671                 }
672                 // 选择idle
673                 return self.idle.upgrade().unwrap();
674             }
675         }
676 
677         todo!()
678     }
679 }
680 
681 bitflags! {
682     pub struct SchedFeature:u32 {
683         /// 给予睡眠任务仅有 50% 的服务赤字。这意味着睡眠任务在被唤醒后会获得一定的服务,但不能过多地占用资源。
684         const GENTLE_FAIR_SLEEPERS = 1 << 0;
685         /// 将新任务排在前面,以避免已经运行的任务被饿死
686         const START_DEBIT = 1 << 1;
687         /// 在调度时优先选择上次唤醒的任务,因为它可能会访问之前唤醒的任务所使用的数据,从而提高缓存局部性。
688         const NEXT_BUDDY = 1 << 2;
689         /// 在调度时优先选择上次运行的任务,因为它可能会访问与之前运行的任务相同的数据,从而提高缓存局部性。
690         const LAST_BUDDY = 1 << 3;
691         /// 认为任务的伙伴(buddy)在缓存中是热点,减少缓存伙伴被迁移的可能性,从而提高缓存局部性。
692         const CACHE_HOT_BUDDY = 1 << 4;
693         /// 允许唤醒时抢占当前任务。
694         const WAKEUP_PREEMPTION = 1 << 5;
695         /// 基于任务未运行时间来减少 CPU 的容量。
696         const NONTASK_CAPACITY = 1 << 6;
697         /// 将远程唤醒排队到目标 CPU,并使用调度器 IPI 处理它们,以减少运行队列锁的争用。
698         const TTWU_QUEUE = 1 << 7;
699         /// 在唤醒时尝试限制对最后级联缓存(LLC)域的无谓扫描。
700         const SIS_UTIL = 1 << 8;
701         /// 在 RT(Real-Time)任务迁移时,通过发送 IPI 来减少 CPU 之间的锁竞争。
702         const RT_PUSH_IPI = 1 << 9;
703         /// 启用估计的 CPU 利用率功能,用于调度决策。
704         const UTIL_EST = 1 << 10;
705         const UTIL_EST_FASTUP = 1 << 11;
706         /// 启用备选调度周期
707         const ALT_PERIOD = 1 << 12;
708         /// 启用基本时间片
709         const BASE_SLICE = 1 << 13;
710     }
711 
712     pub struct EnqueueFlag: u8 {
713         const ENQUEUE_WAKEUP	= 0x01;
714         const ENQUEUE_RESTORE	= 0x02;
715         const ENQUEUE_MOVE	= 0x04;
716         const ENQUEUE_NOCLOCK	= 0x08;
717 
718         const ENQUEUE_MIGRATED	= 0x40;
719 
720         const ENQUEUE_INITIAL	= 0x80;
721     }
722 
723     pub struct DequeueFlag: u8 {
724         const DEQUEUE_SLEEP		= 0x01;
725         const DEQUEUE_SAVE		= 0x02; /* Matches ENQUEUE_RESTORE */
726         const DEQUEUE_MOVE		= 0x04; /* Matches ENQUEUE_MOVE */
727         const DEQUEUE_NOCLOCK		= 0x08; /* Matches ENQUEUE_NOCLOCK */
728     }
729 
730     pub struct WakeupFlags: u8 {
731         /* Wake flags. The first three directly map to some SD flag value */
732         const WF_EXEC         = 0x02; /* Wakeup after exec; maps to SD_BALANCE_EXEC */
733         const WF_FORK         = 0x04; /* Wakeup after fork; maps to SD_BALANCE_FORK */
734         const WF_TTWU         = 0x08; /* Wakeup;            maps to SD_BALANCE_WAKE */
735 
736         const WF_SYNC         = 0x10; /* Waker goes to sleep after wakeup */
737         const WF_MIGRATED     = 0x20; /* Internal use, task got migrated */
738         const WF_CURRENT_CPU  = 0x40; /* Prefer to move the wakee to the current CPU. */
739     }
740 
741     pub struct SchedMode: u8 {
742         /*
743         * Constants for the sched_mode argument of __schedule().
744         *
745         * The mode argument allows RT enabled kernels to differentiate a
746         * preemption from blocking on an 'sleeping' spin/rwlock. Note that
747         * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
748         * optimize the AND operation out and just check for zero.
749         */
750         /// 在调度过程中不会再次进入队列,即需要手动唤醒
751         const SM_NONE			= 0x0;
752         /// 重新加入队列,即当前进程被抢占,需要时钟调度
753         const SM_PREEMPT		= 0x1;
754         /// rt相关
755         const SM_RTLOCK_WAIT		= 0x2;
756         /// 默认与SM_PREEMPT相同
757         const SM_MASK_PREEMPT	= Self::SM_PREEMPT.bits;
758     }
759 }
760 
761 #[derive(Copy, Clone, Debug, PartialEq)]
762 pub enum OnRq {
763     Queued,
764     Migrating,
765     None,
766 }
767 
768 impl ProcessManager {
769     pub fn update_process_times(user_tick: bool) {
770         let pcb = Self::current_pcb();
771         CpuTimeFunc::irqtime_account_process_tick(&pcb, user_tick, 1);
772 
773         scheduler_tick();
774     }
775 }
776 
777 /// ## 时钟tick时调用此函数
778 pub fn scheduler_tick() {
779     fence(Ordering::SeqCst);
780     // 获取当前CPU索引
781     let cpu_idx = smp_get_processor_id().data() as usize;
782 
783     // 获取当前CPU的请求队列
784     let rq = cpu_rq(cpu_idx);
785 
786     let (rq, guard) = rq.self_lock();
787 
788     // 获取当前请求队列的当前请求
789     let current = rq.current();
790 
791     // 更新请求队列时钟
792     rq.update_rq_clock();
793 
794     match current.sched_info().policy() {
795         SchedPolicy::CFS => CompletelyFairScheduler::tick(rq, current, false),
796         SchedPolicy::FIFO => todo!(),
797         SchedPolicy::RT => todo!(),
798         SchedPolicy::IDLE => IdleScheduler::tick(rq, current, false),
799     }
800 
801     rq.calculate_global_load_tick();
802 
803     drop(guard);
804     // TODO:处理负载均衡
805 }
806 
807 /// ## 执行调度
808 /// 若preempt_count不为0则报错
809 #[inline]
810 pub fn schedule(sched_mod: SchedMode) {
811     let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
812     assert_eq!(ProcessManager::current_pcb().preempt_count(), 0);
813     __schedule(sched_mod);
814 }
815 
816 /// ## 执行调度
817 /// 此函数与schedule的区别为,该函数不会检查preempt_count
818 /// 适用于时钟中断等场景
819 pub fn __schedule(sched_mod: SchedMode) {
820     let cpu = smp_get_processor_id().data() as usize;
821     let rq = cpu_rq(cpu);
822 
823     let mut prev = rq.current();
824     if let ProcessState::Exited(_) = prev.clone().sched_info().inner_lock_read_irqsave().state() {
825         // 从exit进的Schedule
826         prev = ProcessManager::current_pcb();
827     }
828 
829     // TODO: hrtick_clear(rq);
830 
831     let (rq, _guard) = rq.self_lock();
832 
833     rq.clock_updata_flags = ClockUpdataFlag::from_bits_truncate(rq.clock_updata_flags.bits() << 1);
834 
835     rq.update_rq_clock();
836     rq.clock_updata_flags = ClockUpdataFlag::RQCF_UPDATE;
837 
838     // kBUG!(
839     //     "before cfs rq pcbs {:?}\nvruntimes {:?}\n",
840     //     rq.cfs
841     //         .entities
842     //         .iter()
843     //         .map(|x| { x.1.pcb().pid() })
844     //         .collect::<Vec<_>>(),
845     //     rq.cfs
846     //         .entities
847     //         .iter()
848     //         .map(|x| { x.1.vruntime })
849     //         .collect::<Vec<_>>(),
850     // );
851     // kwarn!(
852     //     "before cfs rq {:?} prev {:?}",
853     //     rq.cfs
854     //         .entities
855     //         .iter()
856     //         .map(|x| { x.1.pcb().pid() })
857     //         .collect::<Vec<_>>(),
858     //     prev.pid()
859     // );
860 
861     // kerror!("prev pid {:?} {:?}", prev.pid(), prev.sched_info().policy());
862     if !sched_mod.contains(SchedMode::SM_MASK_PREEMPT)
863         && prev.sched_info().policy() != SchedPolicy::IDLE
864         && prev.sched_info().inner_lock_read_irqsave().is_mark_sleep()
865     {
866         // kwarn!("deactivate_task prev {:?}", prev.pid());
867         // TODO: 这里需要处理信号
868         // https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/core.c?r=&mo=172979&fi=6578#6630
869         rq.deactivate_task(
870             prev.clone(),
871             DequeueFlag::DEQUEUE_SLEEP | DequeueFlag::DEQUEUE_NOCLOCK,
872         );
873     }
874 
875     let next = rq.pick_next_task(prev.clone());
876 
877     // kBUG!(
878     //     "after cfs rq pcbs {:?}\nvruntimes {:?}\n",
879     //     rq.cfs
880     //         .entities
881     //         .iter()
882     //         .map(|x| { x.1.pcb().pid() })
883     //         .collect::<Vec<_>>(),
884     //     rq.cfs
885     //         .entities
886     //         .iter()
887     //         .map(|x| { x.1.vruntime })
888     //         .collect::<Vec<_>>(),
889     // );
890 
891     // kerror!("next {:?}", next.pid());
892 
893     prev.flags().remove(ProcessFlags::NEED_SCHEDULE);
894     fence(Ordering::SeqCst);
895     if likely(!Arc::ptr_eq(&prev, &next)) {
896         rq.set_current(Arc::downgrade(&next));
897         // kwarn!(
898         //     "switch_process prev {:?} next {:?} sched_mode {sched_mod:?}",
899         //     prev.pid(),
900         //     next.pid()
901         // );
902 
903         // send_to_default_serial8250_port(
904         //     format!(
905         //         "switch_process prev {:?} next {:?} sched_mode {sched_mod:?}\n",
906         //         prev.pid(),
907         //         next.pid()
908         //     )
909         //     .as_bytes(),
910         // );
911 
912         // CurrentApic.send_eoi();
913         compiler_fence(Ordering::SeqCst);
914 
915         unsafe { ProcessManager::switch_process(prev, next) };
916     } else {
917         kwarn!(
918             "!!!switch_process {} {:?} to self ",
919             prev.basic().name(),
920             prev.pid(),
921         );
922 
923         assert!(
924             Arc::ptr_eq(&ProcessManager::current_pcb(), &prev),
925             "{}",
926             ProcessManager::current_pcb().basic().name()
927         );
928     }
929 }
930 
931 pub fn sched_fork(pcb: &Arc<ProcessControlBlock>) -> Result<(), SystemError> {
932     let mut prio_guard = pcb.sched_info().prio_data.write_irqsave();
933     let current = ProcessManager::current_pcb();
934 
935     prio_guard.prio = current.sched_info().prio_data.read_irqsave().normal_prio;
936 
937     if PrioUtil::dl_prio(prio_guard.prio) {
938         return Err(SystemError::EAGAIN_OR_EWOULDBLOCK);
939     } else if PrioUtil::rt_prio(prio_guard.prio) {
940         let policy = &pcb.sched_info().sched_policy;
941         *policy.write_irqsave() = SchedPolicy::RT;
942     } else {
943         let policy = &pcb.sched_info().sched_policy;
944         *policy.write_irqsave() = SchedPolicy::CFS;
945     }
946 
947     pcb.sched_info()
948         .sched_entity()
949         .force_mut()
950         .init_entity_runnable_average();
951 
952     Ok(())
953 }
954 
955 pub fn sched_cgroup_fork(pcb: &Arc<ProcessControlBlock>) {
956     __set_task_cpu(pcb, smp_get_processor_id());
957     match pcb.sched_info().policy() {
958         SchedPolicy::RT => todo!(),
959         SchedPolicy::FIFO => todo!(),
960         SchedPolicy::CFS => CompletelyFairScheduler::task_fork(pcb.clone()),
961         SchedPolicy::IDLE => todo!(),
962     }
963 }
964 
965 fn __set_task_cpu(pcb: &Arc<ProcessControlBlock>, cpu: ProcessorId) {
966     // TODO: Fixme There is not implement group sched;
967     let se = pcb.sched_info().sched_entity();
968     let rq = cpu_rq(cpu.data() as usize);
969     se.force_mut().set_cfs(Arc::downgrade(&rq.cfs));
970 }
971 
972 #[inline(never)]
973 pub fn sched_init() {
974     // 初始化percpu变量
975     unsafe {
976         CPU_IRQ_TIME = Some(Vec::with_capacity(PerCpu::MAX_CPU_NUM as usize));
977         CPU_IRQ_TIME
978             .as_mut()
979             .unwrap()
980             .resize_with(PerCpu::MAX_CPU_NUM as usize, || Box::leak(Box::default()));
981 
982         let mut cpu_runqueue = Vec::with_capacity(PerCpu::MAX_CPU_NUM as usize);
983         for cpu in 0..PerCpu::MAX_CPU_NUM as usize {
984             let rq = Arc::new(CpuRunQueue::new(cpu));
985             rq.cfs.force_mut().set_rq(Arc::downgrade(&rq));
986             cpu_runqueue.push(rq);
987         }
988 
989         CPU_RUNQUEUE.init(PerCpuVar::new(cpu_runqueue).unwrap());
990     };
991 }
992 
993 #[inline]
994 pub fn send_resched_ipi(cpu: ProcessorId) {
995     send_ipi(IpiKind::KickCpu, IpiTarget::Specified(cpu));
996 }
997