xref: /DragonOS/kernel/src/perf/bpf.rs (revision fae6e9ade46a52976ad5d099643d51cc20876448)
1*fae6e9adSlinfeng use super::{PerfEventOps, Result};
2*fae6e9adSlinfeng use crate::arch::mm::LockedFrameAllocator;
3*fae6e9adSlinfeng use crate::arch::MMArch;
4*fae6e9adSlinfeng use crate::filesystem::vfs::file::PageCache;
5*fae6e9adSlinfeng use crate::filesystem::vfs::{FilePrivateData, FileSystem, IndexNode};
6*fae6e9adSlinfeng use crate::include::bindings::linux_bpf::{
7*fae6e9adSlinfeng     perf_event_header, perf_event_mmap_page, perf_event_type,
8*fae6e9adSlinfeng };
9*fae6e9adSlinfeng use crate::libs::spinlock::{SpinLock, SpinLockGuard};
10*fae6e9adSlinfeng use crate::mm::allocator::page_frame::{FrameAllocator, PageFrameCount, PhysPageFrame};
11*fae6e9adSlinfeng use crate::mm::page::{page_manager_lock_irqsave, Page};
12*fae6e9adSlinfeng use crate::mm::{MemoryManagementArch, PhysAddr};
13*fae6e9adSlinfeng use crate::perf::util::{LostSamples, PerfProbeArgs, PerfSample, SampleHeader};
14*fae6e9adSlinfeng use alloc::string::String;
15*fae6e9adSlinfeng use alloc::sync::Arc;
16*fae6e9adSlinfeng use alloc::vec::Vec;
17*fae6e9adSlinfeng use core::any::Any;
18*fae6e9adSlinfeng use core::fmt::Debug;
19*fae6e9adSlinfeng use system_error::SystemError;
/// Architecture page size, shortened for local use.
const PAGE_SIZE: usize = MMArch::PAGE_SIZE;
/// A perf event whose samples are published to user space through a
/// memory-mapped ring buffer (see [`RingPage`]).
#[derive(Debug)]
pub struct BpfPerfEvent {
    // Arguments passed at perf_event_open time; kept but not read here.
    _args: PerfProbeArgs,
    // Mutable event state, guarded by a spinlock.
    data: SpinLock<BpfPerfEventData>,
}
26*fae6e9adSlinfeng 
/// Lock-protected state of a [`BpfPerfEvent`].
#[derive(Debug)]
pub struct BpfPerfEventData {
    // Whether the event is currently enabled (toggled via PerfEventOps).
    enabled: bool,
    // The mmap'ed ring buffer used to hand samples to user space.
    mmap_page: RingPage,
    // Page cache exposing the ring-buffer pages to the mmap machinery.
    page_cache: Arc<PageCache>,
    // mmap offset requested by user space (stored in do_mmap; not read here).
    offset: usize,
}
34*fae6e9adSlinfeng 
/// A perf ring buffer: one control page (`perf_event_mmap_page`) followed by
/// the data region holding sample / lost records.
#[derive(Debug)]
pub struct RingPage {
    // Total mapping size in bytes (control page + data region);
    // a multiple of PAGE_SIZE.
    size: usize,
    // Kernel virtual address of the first (control) page.
    ptr: usize,
    // Size of the data region in bytes (`size - PAGE_SIZE`).
    data_region_size: usize,
    // Records dropped since the last PERF_RECORD_LOST was written.
    lost: usize,
    // Physical address of the first backing page.
    phys_addr: PhysAddr,
}
43*fae6e9adSlinfeng 
44*fae6e9adSlinfeng impl RingPage {
45*fae6e9adSlinfeng     pub fn empty() -> Self {
46*fae6e9adSlinfeng         RingPage {
47*fae6e9adSlinfeng             ptr: 0,
48*fae6e9adSlinfeng             size: 0,
49*fae6e9adSlinfeng             data_region_size: 0,
50*fae6e9adSlinfeng             lost: 0,
51*fae6e9adSlinfeng             phys_addr: PhysAddr::new(0),
52*fae6e9adSlinfeng         }
53*fae6e9adSlinfeng     }
54*fae6e9adSlinfeng 
55*fae6e9adSlinfeng     pub fn new_init(start: usize, len: usize, phys_addr: PhysAddr) -> Self {
56*fae6e9adSlinfeng         Self::init(start as _, len, phys_addr)
57*fae6e9adSlinfeng     }
58*fae6e9adSlinfeng 
59*fae6e9adSlinfeng     fn init(ptr: *mut u8, size: usize, phys_addr: PhysAddr) -> Self {
60*fae6e9adSlinfeng         assert_eq!(size % PAGE_SIZE, 0);
61*fae6e9adSlinfeng         assert!(size / PAGE_SIZE >= 2);
62*fae6e9adSlinfeng         // The first page will be filled with perf_event_mmap_page
63*fae6e9adSlinfeng         unsafe {
64*fae6e9adSlinfeng             let perf_event_mmap_page = &mut *(ptr as *mut perf_event_mmap_page);
65*fae6e9adSlinfeng             perf_event_mmap_page.data_offset = PAGE_SIZE as u64;
66*fae6e9adSlinfeng             perf_event_mmap_page.data_size = (size - PAGE_SIZE) as u64;
67*fae6e9adSlinfeng             // user will read sample or lost record from data_tail
68*fae6e9adSlinfeng             perf_event_mmap_page.data_tail = 0;
69*fae6e9adSlinfeng             // kernel will write sample or lost record from data_head
70*fae6e9adSlinfeng             perf_event_mmap_page.data_head = 0;
71*fae6e9adSlinfeng             // It is a ring buffer.
72*fae6e9adSlinfeng         }
73*fae6e9adSlinfeng         RingPage {
74*fae6e9adSlinfeng             ptr: ptr as usize,
75*fae6e9adSlinfeng             size,
76*fae6e9adSlinfeng             data_region_size: size - PAGE_SIZE,
77*fae6e9adSlinfeng             lost: 0,
78*fae6e9adSlinfeng             phys_addr,
79*fae6e9adSlinfeng         }
80*fae6e9adSlinfeng     }
81*fae6e9adSlinfeng 
82*fae6e9adSlinfeng     fn can_write(&self, data_size: usize, data_tail: usize, data_head: usize) -> bool {
83*fae6e9adSlinfeng         if (data_head + 1) % self.data_region_size == data_tail {
84*fae6e9adSlinfeng             // The buffer is full
85*fae6e9adSlinfeng             return false;
86*fae6e9adSlinfeng         }
87*fae6e9adSlinfeng         let capacity = if data_head >= data_tail {
88*fae6e9adSlinfeng             self.data_region_size - data_head + data_tail
89*fae6e9adSlinfeng         } else {
90*fae6e9adSlinfeng             data_tail - data_head
91*fae6e9adSlinfeng         };
92*fae6e9adSlinfeng         data_size <= capacity
93*fae6e9adSlinfeng     }
94*fae6e9adSlinfeng 
95*fae6e9adSlinfeng     pub fn write_event(&mut self, data: &[u8]) -> Result<()> {
96*fae6e9adSlinfeng         let data_tail = unsafe { &mut (*(self.ptr as *mut perf_event_mmap_page)).data_tail };
97*fae6e9adSlinfeng         let data_head = unsafe { &mut (*(self.ptr as *mut perf_event_mmap_page)).data_head };
98*fae6e9adSlinfeng         // data_tail..data_head is the region that can be written
99*fae6e9adSlinfeng         // check if there is enough space to write the event
100*fae6e9adSlinfeng         let sample_size = PerfSample::calculate_size(data.len());
101*fae6e9adSlinfeng 
102*fae6e9adSlinfeng         let can_write_sample =
103*fae6e9adSlinfeng             self.can_write(sample_size, *data_tail as usize, *data_head as usize);
104*fae6e9adSlinfeng         // log::error!(
105*fae6e9adSlinfeng         //     "can_write_sample: {}, data_tail: {}, data_head: {}, data.len(): {}, region_size: {}",
106*fae6e9adSlinfeng         //     can_write_sample,
107*fae6e9adSlinfeng         //     *data_tail,
108*fae6e9adSlinfeng         //     *data_head,
109*fae6e9adSlinfeng         //     data.len(),
110*fae6e9adSlinfeng         //     self.data_region_size
111*fae6e9adSlinfeng         // );
112*fae6e9adSlinfeng         if !can_write_sample {
113*fae6e9adSlinfeng             //we need record it to the lost record
114*fae6e9adSlinfeng             self.lost += 1;
115*fae6e9adSlinfeng             // log::error!(
116*fae6e9adSlinfeng             //     "Lost record: {}, data_tail: {}, data_head: {}",
117*fae6e9adSlinfeng             //     self.lost,
118*fae6e9adSlinfeng             //     *data_tail,
119*fae6e9adSlinfeng             //     *data_head
120*fae6e9adSlinfeng             // );
121*fae6e9adSlinfeng             Ok(())
122*fae6e9adSlinfeng         } else {
123*fae6e9adSlinfeng             // we can write the sample to the page
124*fae6e9adSlinfeng             // If the lost record is not zero, we need to write the lost record first.
125*fae6e9adSlinfeng             let can_write_lost_record = self.can_write(
126*fae6e9adSlinfeng                 size_of::<LostSamples>(),
127*fae6e9adSlinfeng                 *data_tail as usize,
128*fae6e9adSlinfeng                 *data_head as usize,
129*fae6e9adSlinfeng             );
130*fae6e9adSlinfeng             if self.lost > 0 && can_write_lost_record {
131*fae6e9adSlinfeng                 let new_data_head = self.write_lost(*data_head as usize)?;
132*fae6e9adSlinfeng                 *data_head = new_data_head as u64;
133*fae6e9adSlinfeng                 // log::info!(
134*fae6e9adSlinfeng                 //     "Write lost record: {}, data_tail: {}, new_data_head: {}",
135*fae6e9adSlinfeng                 //     self.lost,
136*fae6e9adSlinfeng                 //     *data_tail,
137*fae6e9adSlinfeng                 //     *data_head
138*fae6e9adSlinfeng                 // );
139*fae6e9adSlinfeng                 self.lost = 0;
140*fae6e9adSlinfeng                 self.write_event(data)
141*fae6e9adSlinfeng             } else {
142*fae6e9adSlinfeng                 let new_data_head = self.write_sample(data, *data_head as usize)?;
143*fae6e9adSlinfeng                 *data_head = new_data_head as u64;
144*fae6e9adSlinfeng                 // log::info!(
145*fae6e9adSlinfeng                 //     "Write sample record, data_tail: {}, new_data_head: {}",
146*fae6e9adSlinfeng                 //     *data_tail,
147*fae6e9adSlinfeng                 //     *data_head
148*fae6e9adSlinfeng                 // );
149*fae6e9adSlinfeng                 Ok(())
150*fae6e9adSlinfeng             }
151*fae6e9adSlinfeng         }
152*fae6e9adSlinfeng     }
153*fae6e9adSlinfeng 
154*fae6e9adSlinfeng     /// Write any data to the page.
155*fae6e9adSlinfeng     ///
156*fae6e9adSlinfeng     /// Return the new data_head
157*fae6e9adSlinfeng     fn write_any(&mut self, data: &[u8], data_head: usize) -> Result<usize> {
158*fae6e9adSlinfeng         let data_region_len = self.data_region_size;
159*fae6e9adSlinfeng         let data_region = self.as_mut_slice()[PAGE_SIZE..].as_mut();
160*fae6e9adSlinfeng         let data_len = data.len();
161*fae6e9adSlinfeng         let end = (data_head + data_len) % data_region_len;
162*fae6e9adSlinfeng         let start = data_head;
163*fae6e9adSlinfeng         if start < end {
164*fae6e9adSlinfeng             data_region[start..end].copy_from_slice(data);
165*fae6e9adSlinfeng         } else {
166*fae6e9adSlinfeng             let first_len = data_region_len - start;
167*fae6e9adSlinfeng             data_region[start..start + first_len].copy_from_slice(&data[..first_len]);
168*fae6e9adSlinfeng             data_region[0..end].copy_from_slice(&data[first_len..]);
169*fae6e9adSlinfeng         }
170*fae6e9adSlinfeng         Ok(end)
171*fae6e9adSlinfeng     }
172*fae6e9adSlinfeng 
173*fae6e9adSlinfeng     /// Write a sample to the page.
174*fae6e9adSlinfeng     fn write_sample(&mut self, data: &[u8], data_head: usize) -> Result<usize> {
175*fae6e9adSlinfeng         let perf_sample = PerfSample {
176*fae6e9adSlinfeng             s_hdr: SampleHeader {
177*fae6e9adSlinfeng                 header: perf_event_header {
178*fae6e9adSlinfeng                     type_: perf_event_type::PERF_RECORD_SAMPLE as u32,
179*fae6e9adSlinfeng                     misc: 0,
180*fae6e9adSlinfeng                     size: size_of::<SampleHeader>() as u16 + data.len() as u16,
181*fae6e9adSlinfeng                 },
182*fae6e9adSlinfeng                 size: data.len() as u32,
183*fae6e9adSlinfeng             },
184*fae6e9adSlinfeng             value: data,
185*fae6e9adSlinfeng         };
186*fae6e9adSlinfeng         let new_head = self.write_any(perf_sample.s_hdr.as_bytes(), data_head)?;
187*fae6e9adSlinfeng         self.write_any(perf_sample.value, new_head)
188*fae6e9adSlinfeng     }
189*fae6e9adSlinfeng 
190*fae6e9adSlinfeng     /// Write a lost record to the page.
191*fae6e9adSlinfeng     ///
192*fae6e9adSlinfeng     /// Return the new data_head
193*fae6e9adSlinfeng     fn write_lost(&mut self, data_head: usize) -> Result<usize> {
194*fae6e9adSlinfeng         let lost = LostSamples {
195*fae6e9adSlinfeng             header: perf_event_header {
196*fae6e9adSlinfeng                 type_: perf_event_type::PERF_RECORD_LOST as u32,
197*fae6e9adSlinfeng                 misc: 0,
198*fae6e9adSlinfeng                 size: size_of::<LostSamples>() as u16,
199*fae6e9adSlinfeng             },
200*fae6e9adSlinfeng             id: 0,
201*fae6e9adSlinfeng             count: self.lost as u64,
202*fae6e9adSlinfeng         };
203*fae6e9adSlinfeng         self.write_any(lost.as_bytes(), data_head)
204*fae6e9adSlinfeng     }
205*fae6e9adSlinfeng 
206*fae6e9adSlinfeng     pub fn readable(&self) -> bool {
207*fae6e9adSlinfeng         let data_tail = unsafe { &(*(self.ptr as *mut perf_event_mmap_page)).data_tail };
208*fae6e9adSlinfeng         let data_head = unsafe { &(*(self.ptr as *mut perf_event_mmap_page)).data_head };
209*fae6e9adSlinfeng         data_tail != data_head
210*fae6e9adSlinfeng     }
211*fae6e9adSlinfeng     pub fn as_slice(&self) -> &[u8] {
212*fae6e9adSlinfeng         unsafe { core::slice::from_raw_parts(self.ptr as *const u8, self.size) }
213*fae6e9adSlinfeng     }
214*fae6e9adSlinfeng     pub fn as_mut_slice(&mut self) -> &mut [u8] {
215*fae6e9adSlinfeng         unsafe { core::slice::from_raw_parts_mut(self.ptr as *mut u8, self.size) }
216*fae6e9adSlinfeng     }
217*fae6e9adSlinfeng }
218*fae6e9adSlinfeng 
impl BpfPerfEvent {
    /// Build a new, disabled BPF perf event. The ring buffer is allocated
    /// lazily when user space maps the fd (see [`Self::do_mmap`]).
    pub fn new(args: PerfProbeArgs) -> Self {
        BpfPerfEvent {
            _args: args,
            data: SpinLock::new(BpfPerfEventData {
                enabled: false,
                mmap_page: RingPage::empty(),
                page_cache: PageCache::new(None),
                offset: 0,
            }),
        }
    }
    /// Back an mmap request of `len` bytes: allocate physical frames,
    /// register each page with the global page manager and this event's page
    /// cache, then initialize the ring-buffer control page over the mapping.
    ///
    /// Returns ENOSPC when no frames are available and EFAULT when the
    /// physical range has no kernel virtual mapping.
    pub fn do_mmap(&self, _start: usize, len: usize, offset: usize) -> Result<()> {
        let mut data = self.data.lock();
        // alloc page frame
        let (phy_addr, page_count) =
            unsafe { LockedFrameAllocator.allocate(PageFrameCount::new(len / PAGE_SIZE)) }
                .ok_or(SystemError::ENOSPC)?;
        let mut page_manager_guard = page_manager_lock_irqsave();
        let mut cur_phys = PhysPageFrame::new(phy_addr);
        // Walk the allocated range page by page; `i` doubles as the page
        // index inside this event's page cache.
        for i in 0..page_count.data() {
            let page = Arc::new(Page::new(true, cur_phys.phys_address()));
            let paddr = cur_phys.phys_address();
            page_manager_guard.insert(paddr, &page);
            data.page_cache.add_page(i, &page);
            cur_phys = cur_phys.next();
        }
        let virt_addr = unsafe { MMArch::phys_2_virt(phy_addr) }.ok_or(SystemError::EFAULT)?;
        // create mmap page (control page + data region) over the new mapping
        let mmap_page = RingPage::new_init(virt_addr.data(), len, phy_addr);
        data.mmap_page = mmap_page;
        data.offset = offset;
        Ok(())
    }

    /// Push one sample into the ring buffer (see [`RingPage::write_event`]).
    pub fn write_event(&self, data: &[u8]) -> Result<()> {
        let mut inner_data = self.data.lock();
        inner_data.mmap_page.write_event(data)?;
        Ok(())
    }
}
260*fae6e9adSlinfeng 
impl Drop for BpfPerfEvent {
    /// Tear down the ring-buffer mapping created in `do_mmap`.
    ///
    /// If `do_mmap` was never called, `mmap_page` is the empty placeholder
    /// (size 0), so the loop below runs zero times.
    fn drop(&mut self) {
        // Lock order: page manager first, then the event data — keep this
        // consistent with other paths touching both locks.
        let mut page_manager_guard = page_manager_lock_irqsave();
        let data = self.data.lock();
        let phy_addr = data.mmap_page.phys_addr;
        let len = data.mmap_page.size;
        let page_count = PageFrameCount::new(len / PAGE_SIZE);
        let mut cur_phys = PhysPageFrame::new(phy_addr);
        // Unregister every ring-buffer page from the global page manager.
        // NOTE(review): the frames are never handed back to
        // LockedFrameAllocator here — presumably dropping the Page entries
        // releases them; verify, otherwise the mmap'ed frames leak.
        for _ in 0..page_count.data() {
            page_manager_guard.remove_page(&cur_phys.phys_address());
            cur_phys = cur_phys.next();
        }
    }
}
275*fae6e9adSlinfeng 
276*fae6e9adSlinfeng impl IndexNode for BpfPerfEvent {
277*fae6e9adSlinfeng     fn mmap(&self, start: usize, len: usize, offset: usize) -> Result<()> {
278*fae6e9adSlinfeng         self.do_mmap(start, len, offset)
279*fae6e9adSlinfeng     }
280*fae6e9adSlinfeng 
281*fae6e9adSlinfeng     fn read_at(
282*fae6e9adSlinfeng         &self,
283*fae6e9adSlinfeng         _offset: usize,
284*fae6e9adSlinfeng         _len: usize,
285*fae6e9adSlinfeng         _buf: &mut [u8],
286*fae6e9adSlinfeng         _data: SpinLockGuard<FilePrivateData>,
287*fae6e9adSlinfeng     ) -> Result<usize> {
288*fae6e9adSlinfeng         panic!("PerfEventInode does not support read")
289*fae6e9adSlinfeng     }
290*fae6e9adSlinfeng 
291*fae6e9adSlinfeng     fn write_at(
292*fae6e9adSlinfeng         &self,
293*fae6e9adSlinfeng         _offset: usize,
294*fae6e9adSlinfeng         _len: usize,
295*fae6e9adSlinfeng         _buf: &[u8],
296*fae6e9adSlinfeng         _data: SpinLockGuard<FilePrivateData>,
297*fae6e9adSlinfeng     ) -> Result<usize> {
298*fae6e9adSlinfeng         panic!("PerfEventInode does not support write")
299*fae6e9adSlinfeng     }
300*fae6e9adSlinfeng 
301*fae6e9adSlinfeng     fn fs(&self) -> Arc<dyn FileSystem> {
302*fae6e9adSlinfeng         panic!("PerfEventInode does not have a filesystem")
303*fae6e9adSlinfeng     }
304*fae6e9adSlinfeng 
305*fae6e9adSlinfeng     fn as_any_ref(&self) -> &dyn Any {
306*fae6e9adSlinfeng         self
307*fae6e9adSlinfeng     }
308*fae6e9adSlinfeng     fn list(&self) -> Result<Vec<String>> {
309*fae6e9adSlinfeng         Err(SystemError::ENOSYS)
310*fae6e9adSlinfeng     }
311*fae6e9adSlinfeng 
312*fae6e9adSlinfeng     fn page_cache(&self) -> Option<Arc<PageCache>> {
313*fae6e9adSlinfeng         Some(self.data.lock().page_cache.clone())
314*fae6e9adSlinfeng     }
315*fae6e9adSlinfeng }
316*fae6e9adSlinfeng 
317*fae6e9adSlinfeng impl PerfEventOps for BpfPerfEvent {
318*fae6e9adSlinfeng     fn enable(&self) -> Result<()> {
319*fae6e9adSlinfeng         self.data.lock().enabled = true;
320*fae6e9adSlinfeng         Ok(())
321*fae6e9adSlinfeng     }
322*fae6e9adSlinfeng     fn disable(&self) -> Result<()> {
323*fae6e9adSlinfeng         self.data.lock().enabled = false;
324*fae6e9adSlinfeng         Ok(())
325*fae6e9adSlinfeng     }
326*fae6e9adSlinfeng     fn readable(&self) -> bool {
327*fae6e9adSlinfeng         self.data.lock().mmap_page.readable()
328*fae6e9adSlinfeng     }
329*fae6e9adSlinfeng }
330*fae6e9adSlinfeng 
/// Create a BPF-backed perf event from the parsed perf_event_open arguments.
pub fn perf_event_open_bpf(args: PerfProbeArgs) -> BpfPerfEvent {
    BpfPerfEvent::new(args)
}
334