From 1fd3cf6ff5fbab1ca1846a14b93691d3213f9f56 Mon Sep 17 00:00:00 2001 From: Johnathan Sharratt Date: Tue, 18 Jun 2024 21:54:55 +1000 Subject: [PATCH 1/2] Added a dirty map implementation --- lib/api/src/externals/memory.rs | 11 +++ lib/api/src/sys/externals/memory.rs | 9 ++ .../src/memory/fd_memory/memories.rs | 11 +++ lib/vm/src/lib.rs | 2 + lib/vm/src/memory.rs | 66 ++++++++++++++ lib/vm/src/mmap.rs | 87 ++++++++++++++++++- .../journal/effector/memory_and_snapshot.rs | 55 +++++++----- tests/integration/cli/tests/publish.rs | 1 + 8 files changed, 217 insertions(+), 25 deletions(-) diff --git a/lib/api/src/externals/memory.rs b/lib/api/src/externals/memory.rs index 3e2b07efa22..f50a7da1877 100644 --- a/lib/api/src/externals/memory.rs +++ b/lib/api/src/externals/memory.rs @@ -137,6 +137,17 @@ impl Memory { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + pub fn dirty_regions(&self, store: &mut impl AsStoreMut) -> std::collections::BTreeMap { + self.0.dirty_regions(store) + } + + /// Resets the dirty pages in the memory map + /// by remmap'ing the memory address space + pub fn remap(&self, store: &mut impl AsStoreMut) -> Result<(), MemoryError> { + self.0.remap(store) + } + /// Attempts to duplicate this memory (if its clonable) in a new store /// (copied memory) pub fn copy_to_store( diff --git a/lib/api/src/sys/externals/memory.rs b/lib/api/src/sys/externals/memory.rs index b61adc4ff68..0a77f2b64be 100644 --- a/lib/api/src/sys/externals/memory.rs +++ b/lib/api/src/sys/externals/memory.rs @@ -70,6 +70,15 @@ impl Memory { Ok(()) } + pub fn dirty_regions(&self, store: &mut impl AsStoreMut) -> std::collections::BTreeMap { + self.handle.get_mut(store.objects_mut()).dirty_map() + } + + pub fn remap(&self, store: &mut impl AsStoreMut) -> Result<(), MemoryError> { + self.handle.get_mut(store.objects_mut()).reset_dirty_map()?; + Ok(()) + } + pub(crate) fn from_vm_extern(store: &impl AsStoreRef, vm_extern: VMExternMemory) -> Self { Self { handle: 
unsafe { diff --git a/lib/sys-utils/src/memory/fd_memory/memories.rs b/lib/sys-utils/src/memory/fd_memory/memories.rs index f11e35ce215..34f4847b0b7 100644 --- a/lib/sys-utils/src/memory/fd_memory/memories.rs +++ b/lib/sys-utils/src/memory/fd_memory/memories.rs @@ -553,6 +553,17 @@ impl LinearMemory for VMMemory { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + fn dirty_map<'a>(&'a mut self) -> std::collections::BTreeMap { + self.0.dirty_map() + } + + /// Resets the dirty pages in the memory map using a remap + /// of the memory mapped region + fn reset_dirty_map(&mut self) -> Result<(), MemoryError> { + self.0.reset_dirty_map() + } + /// Returns the memory style for this memory. fn style(&self) -> MemoryStyle { self.0.style() diff --git a/lib/vm/src/lib.rs b/lib/vm/src/lib.rs index fabdc38a48f..5ffec1c337b 100644 --- a/lib/vm/src/lib.rs +++ b/lib/vm/src/lib.rs @@ -32,6 +32,8 @@ mod trap; mod vmcontext; pub mod libcalls; +#[cfg(target_os = "linux")] +mod dirty_map; use std::ptr::NonNull; diff --git a/lib/vm/src/memory.rs b/lib/vm/src/memory.rs index 1052b61ef0e..81a61d894e3 100644 --- a/lib/vm/src/memory.rs +++ b/lib/vm/src/memory.rs @@ -142,6 +142,20 @@ impl WasmMmap { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + #[cfg(target_os = "linux")] + pub fn dirty_map<'a>(&'a mut self) -> &'a std::collections::BTreeMap { + self.alloc.dirty_map() + } + + /// Remaps the existing mmap region again discarding anything + /// that was already captured. 
This is useful for clearing all + /// the dirty flags + #[cfg(target_os = "linux")] + pub fn reset_dirty_map(&mut self) -> Result<(), String> { + self.alloc.reset_dirty_map() + } + /// Copies the memory /// (in this case it performs a copy-on-write to save memory) pub fn copy(&mut self) -> Result { @@ -414,6 +428,19 @@ impl LinearMemory for VMOwnedMemory { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + #[cfg(target_os = "linux")] + fn dirty_map<'a>(&'a mut self) -> std::collections::BTreeMap { + self.mmap.dirty_map().clone() + } + + /// Resets the dirty pages in the memory map using a remap + /// of the memory mapped region + #[cfg(target_os = "linux")] + fn reset_dirty_map(&mut self) -> Result<(), MemoryError> { + self.mmap.reset_dirty_map().map_err(|s| MemoryError::Region(s)) + } + /// Return a `VMMemoryDefinition` for exposing the memory to compiled wasm code. fn vmmemory(&self) -> NonNull { self.mmap.vm_memory_definition.as_ptr() @@ -565,6 +592,21 @@ impl LinearMemory for VMSharedMemory { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + #[cfg(target_os = "linux")] + fn dirty_map<'a>(&'a mut self) -> std::collections::BTreeMap { + let mut guard = self.mmap.write().unwrap(); + guard.dirty_map().clone() + } + + /// Resets the dirty pages in the memory map using a remap + /// of the memory mapped region + #[cfg(target_os = "linux")] + fn reset_dirty_map(&mut self) -> Result<(), MemoryError> { + let mut guard = self.mmap.write().unwrap(); + guard.reset_dirty_map().map_err(|s| MemoryError::Region(s)) + } + /// Return a `VMMemoryDefinition` for exposing the memory to compiled wasm code. 
fn vmmemory(&self) -> NonNull { let guard = self.mmap.read().unwrap(); @@ -654,6 +696,17 @@ impl LinearMemory for VMMemory { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + fn dirty_map<'a>(&'a mut self) -> std::collections::BTreeMap { + self.0.dirty_map() + } + + /// Resets the dirty pages in the memory map using a remap + /// of the memory mapped region + fn reset_dirty_map(&mut self) -> Result<(), MemoryError> { + self.0.reset_dirty_map() + } + /// Returns the memory style for this memory. fn style(&self) -> MemoryStyle { self.0.style() @@ -811,6 +864,19 @@ where }) } + /// Returns the list of dirty regions since the mmap was made + fn dirty_map<'a>(&'a mut self) -> std::collections::BTreeMap { + std::collections::BTreeMap::new() + } + + /// Resets the dirty pages in the memory map using a remap + /// of the memory mapped region + fn reset_dirty_map(&mut self) -> Result<(), MemoryError> { + Err(MemoryError::UnsupportedOperation { + message: "remap() is not supported".to_string(), + }) + } + /// Return a `VMMemoryDefinition` for exposing the memory to compiled wasm code. fn vmmemory(&self) -> NonNull; diff --git a/lib/vm/src/mmap.rs b/lib/vm/src/mmap.rs index 6d1ce905494..77a45bf7ced 100644 --- a/lib/vm/src/mmap.rs +++ b/lib/vm/src/mmap.rs @@ -5,10 +5,16 @@ //! of memory. use more_asserts::assert_le; +use std::collections::BTreeMap; use std::io; use std::ptr; use std::slice; +#[cfg(target_os = "linux")] +use crate::dirty_map::DirtyMapController; +#[cfg(target_os = "linux")] +use crate::dirty_map::DirtyMapWatcher; + /// Round `size` up to the nearest multiple of `page_size`. 
fn round_up_to_page_size(size: usize, page_size: usize) -> usize { (size + (page_size - 1)) & !(page_size - 1) @@ -26,6 +32,10 @@ pub struct Mmap { total_size: usize, accessible_size: usize, sync_on_drop: bool, + flags: i32, + #[cfg(target_os = "linux")] + dirty_map: DirtyMapWatcher, + memory_fd: i32, } /// The type of mmap to create @@ -45,11 +55,16 @@ impl Mmap { // contains code to create a non-null dangling pointer value when // constructed empty, so we reuse that here. let empty = Vec::::new(); + let ptr = empty.as_ptr() as usize; Self { - ptr: empty.as_ptr() as usize, + ptr, + flags: 0, total_size: 0, accessible_size: 0, sync_on_drop: false, + memory_fd: -1, + #[cfg(target_os = "linux")] + dirty_map: DirtyMapController::new().watch(ptr), } } @@ -70,7 +85,7 @@ impl Mmap { mut backing_file: Option, memory_type: MmapType, ) -> Result { - use std::os::fd::IntoRawFd; + use std::os::fd::{FromRawFd, IntoRawFd}; let page_size = region::page::size(); assert_le!(accessible_size, mapping_size); @@ -141,6 +156,7 @@ impl Mmap { ) }; if ptr as isize == -1_isize { + std::mem::drop(unsafe { std::fs::File::from_raw_fd(memory_fd) }); return Err(io::Error::last_os_error().to_string()); } @@ -148,7 +164,11 @@ impl Mmap { ptr: ptr as usize, total_size: mapping_size, accessible_size, + flags, sync_on_drop: memory_fd != -1 && memory_type == MmapType::Shared, + memory_fd, + #[cfg(target_os = "linux")] + dirty_map: DirtyMapController::new().watch(ptr as usize), } } else { // Reserve the mapping size. 
@@ -163,6 +183,7 @@ impl Mmap { ) }; if ptr as isize == -1_isize { + std::mem::drop(unsafe { std::fs::File::from_raw_fd(memory_fd) }); return Err(io::Error::last_os_error().to_string()); } @@ -170,7 +191,11 @@ impl Mmap { ptr: ptr as usize, total_size: mapping_size, accessible_size, + flags, sync_on_drop: memory_fd != -1 && memory_type == MmapType::Shared, + memory_fd, + #[cfg(target_os = "linux")] + dirty_map: DirtyMapController::new().watch(ptr as usize), }; if accessible_size != 0 { @@ -224,6 +249,7 @@ impl Mmap { ptr: ptr as usize, total_size: mapping_size, accessible_size, + flags: 0, sync_on_drop: false, } } else { @@ -238,6 +264,7 @@ impl Mmap { ptr: ptr as usize, total_size: mapping_size, accessible_size, + flags: 0, sync_on_drop: false, }; @@ -299,6 +326,56 @@ impl Mmap { Ok(()) } + /// Returns the list of dirty regions since the mmap was made + #[cfg(target_os = "linux")] + pub fn dirty_map<'a>(&'a mut self) -> &'a BTreeMap { + self.dirty_map.track_changes(self.accessible_size) + } + + /// Remaps the existing mmap region again discarding anything + /// that was already captured. This is useful for clearing all + /// the dirty flags + #[cfg(target_os = "linux")] + pub fn reset_dirty_map(&mut self) -> Result<(), String> { + if self.accessible_size == self.total_size { + // Allocate a single read-write region at once. + let ptr = unsafe { + libc::mmap( + self.ptr as *mut libc::c_void, + self.total_size, + libc::PROT_READ | libc::PROT_WRITE, + self.flags, + self.memory_fd, + 0, + ) + }; + if ptr as isize == -1_isize { + return Err(io::Error::last_os_error().to_string()); + } + } else { + // Reserve the mapping size. + let ptr = unsafe { + libc::mmap( + self.ptr as *mut libc::c_void, + self.total_size, + libc::PROT_NONE, + self.flags, + self.memory_fd, + 0, + ) + }; + if ptr as isize == -1_isize { + return Err(io::Error::last_os_error().to_string()); + } + + if self.accessible_size != 0 { + // Commit the accessible size. 
+ self.make_accessible(0, self.accessible_size)?; + } + } + Ok(()) + } + /// Return the allocated memory as a slice of u8. pub fn as_slice(&self) -> &[u8] { unsafe { slice::from_raw_parts(self.ptr as *const u8, self.total_size) } @@ -377,6 +454,8 @@ impl Mmap { impl Drop for Mmap { #[cfg(not(target_os = "windows"))] fn drop(&mut self) { + use std::os::fd::FromRawFd; + if self.total_size != 0 { if self.sync_on_drop { let r = unsafe { @@ -391,6 +470,10 @@ impl Drop for Mmap { let r = unsafe { libc::munmap(self.ptr as *mut libc::c_void, self.total_size) }; assert_eq!(r, 0, "munmap failed: {}", io::Error::last_os_error()); } + if self.memory_fd >= 0 { + std::mem::drop(unsafe { std::fs::File::from_raw_fd(self.memory_fd) }); + self.memory_fd = -1; + } } #[cfg(target_os = "windows")] diff --git a/lib/wasix/src/journal/effector/memory_and_snapshot.rs b/lib/wasix/src/journal/effector/memory_and_snapshot.rs index 6804e46f9c5..16192c5c075 100644 --- a/lib/wasix/src/journal/effector/memory_and_snapshot.rs +++ b/lib/wasix/src/journal/effector/memory_and_snapshot.rs @@ -40,6 +40,9 @@ impl JournalEffector { guard: &mut MutexGuard<'_, WasiProcessInner>, trigger: SnapshotTrigger, ) -> anyhow::Result<()> { + // Grab the regions in the memory that are dirty + let (data, mut store) = ctx.data_and_store_mut(); + let dirty_regions = unsafe { data.memory().dirty_regions(&mut store) }; let env = ctx.data(); let memory = unsafe { env.memory_view(ctx) }; @@ -49,33 +52,34 @@ impl JournalEffector { // We do not want the regions to be greater than 64KB as this will // otherwise create too much inefficiency. We choose 64KB as its // aligned with the standard WASM page size. 
- let mut cur = 0u64; let mut regions = Vec::::new(); - while cur < memory.data_size() { - //let mut again = false; - let next = ((cur + MEMORY_REGION_RESOLUTION) / MEMORY_REGION_RESOLUTION) - * MEMORY_REGION_RESOLUTION; - let end = memory.data_size().min(next); - /* - for (_, thread) in guard.threads.iter() { - let layout = thread.memory_layout(); - if cur >= layout.stack_lower && cur < layout.stack_upper { - cur = layout.stack_upper; - again = true; - break; + for (mut cur, r2) in dirty_regions { + while cur < r2 { + //let mut again = false; + let next = ((cur + MEMORY_REGION_RESOLUTION) / MEMORY_REGION_RESOLUTION) + * MEMORY_REGION_RESOLUTION; + let end = memory.data_size().min(next); + /* + for (_, thread) in guard.threads.iter() { + let layout = thread.memory_layout(); + if cur >= layout.stack_lower && cur < layout.stack_upper { + cur = layout.stack_upper; + again = true; + break; + } + if end > layout.stack_lower && end < layout.stack_upper { + end = end.min(layout.stack_lower); + } } - if end > layout.stack_lower && end < layout.stack_upper { - end = end.min(layout.stack_lower); + if again { + continue; } - } - if again { - continue; - } - */ + */ - let region = cur..end; - regions.push(region.into()); - cur = end; + let region = cur..end; + regions.push(region.into()); + cur = end; + } } // Next we examine the dirty page manager and filter out any pages @@ -191,6 +195,11 @@ impl JournalEffector { // When writing snapshots we also flush the journal so that // its guaranteed to be on the disk or network pipe journal.flush().map_err(map_snapshot_err)?; + + // Reset the memory mapping so that all the dirty + // regions are reset + let (data, mut store) = ctx.data_and_store_mut(); + unsafe { data.memory().remap(&mut store)?; } Ok(()) } diff --git a/tests/integration/cli/tests/publish.rs b/tests/integration/cli/tests/publish.rs index c58a224a238..cdd417f4057 100644 --- a/tests/integration/cli/tests/publish.rs +++ b/tests/integration/cli/tests/publish.rs @@ -133,6 
+133,7 @@ fn wasmer_init_publish() { .assert() .success(); + #[allow(unused)] let s = std::fs::read_to_string(path.join("randomversion").join("wasmer.toml")).unwrap(); // publish From d2f7ed0742869aa118a9cf94ef9a5fe79d83eb83 Mon Sep 17 00:00:00 2001 From: Johnathan Sharratt Date: Wed, 19 Jun 2024 09:16:26 +1000 Subject: [PATCH 2/2] Missed a file --- lib/vm/src/dirty_map.rs | 161 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 lib/vm/src/dirty_map.rs diff --git a/lib/vm/src/dirty_map.rs b/lib/vm/src/dirty_map.rs new file mode 100644 index 00000000000..e48a33b5c7c --- /dev/null +++ b/lib/vm/src/dirty_map.rs @@ -0,0 +1,161 @@ +use std::{collections::BTreeMap, fs::File, sync::Arc}; +use std::os::unix::fs::FileExt; + +// We assume the page size is 4K which means we don't support huge pages +const PAGE_SIZE: u64 = 4096; +const N2R: u64 = PAGE_SIZE / std::mem::size_of::<u64>() as u64; + +// The MMU structure represents a page within the page map +#[repr(packed)] +struct MMU<'a>(&'a [u8]); +impl<'a> MMU<'a> { + #[inline] + const fn pte(&self) -> u64 { + let p = self.0; + u64::from_ne_bytes([p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]]) + } + + #[inline] + const fn dirty(&self) -> bool { + // Dirty pages are bit 55 in the MMU + const PTE_DIRTY: u64 = 1 << (55 - 1); + (self.pte() & PTE_DIRTY) != 0 + } +} + +/// Watches a specific piece of memory for soft dirty flags +/// so that it can detect changes +#[derive(Debug)] +pub struct DirtyMapWatcher { + // Reference back to the controller + controller: DirtyMapController, + // Start of the virtual address space + vas: u64, + // The current status of the pages in the virtual address space + // (the length of this buffer represents the number of pages in the block) + mmu: Vec<u8>, + // Represents all the ranges that have been detected as dirty + dirty: BTreeMap<u64, u64>, +} + +impl DirtyMapWatcher { + /// Tracks changes to the memory region we are watching + /// and returns a map of the dirty extents
(measured in bytes) + pub fn track_changes<'a>(&'a mut self, size: usize) -> &'a BTreeMap<u64, u64> { + // Resize the mmu to match the size we are scanning + self.mmu.resize(size / N2R as usize, 0); + self.dirty.clear(); + + // Read all the page map entries for the region we are watching + let mmu_offset = self.vas / N2R; + self.controller.pagemap_fd.read_exact_at(&mut self.mmu, mmu_offset).unwrap(); + + // Loop through all the blocks we are monitoring + let mut n1 = 0usize; + let mut n2 = std::mem::size_of::<u64>(); + while n1 < self.mmu.len() { + let mmu = MMU(&self.mmu[n1..n2]); + if mmu.dirty() { + let r1 = n1 as u64 * N2R; + let r2 = n2 as u64 * N2R; + + // Insert this region into the hashmap + // (optimization - given we walk through the pages linearly from + // front to back we can make some optimizations on how to + // quickly expand the extents) + if let Some((_, r)) = self.dirty.range_mut(..r1).rev().next() { + if *r == r1 { + *r = r2; + } else { + self.dirty.insert(r1, r2); + } + } else { + self.dirty.insert(r1, r2); + } + } + + n1 += std::mem::size_of::<u64>(); + n2 += std::mem::size_of::<u64>(); + } + + &self.dirty + } +} + +/// This is a dirty map that tracks which pages have been written to +/// since they are cleared.
This works on a process level +#[derive(Debug, Clone)] +pub struct DirtyMapController { + pagemap_fd: Arc<File>, +} + +impl DirtyMapController { + /// Creates a dirty map controller which can be used to check for + /// memory changes (writes) to a piece of virtual memory + pub fn new() -> Self { + let file = std::fs::OpenOptions::new() + .read(true) + .open("/proc/self/pagemap") + .unwrap(); + Self { + pagemap_fd: Arc::new(file), + } + } + + /// Creates a watcher that will watch for changes to a specific + /// piece of virtual memory using the soft dirty flags + pub fn watch(&self, ptr: usize) -> DirtyMapWatcher { + DirtyMapWatcher { + controller: self.clone(), + vas: ptr as u64, + mmu: Default::default(), + dirty: Default::default() + } + } +} + +/* +https://linux-kernel.vger.kernel.narkive.com/IED371rj/patch-0-1-pagemap-clear-refs-modify-to-specify-anon-or-mapped-vma-clearing + +This patch makes the clear_refs proc interface a bit more versatile. It +adds support for clearing either anonymous, file mapped pages or both. + +echo 1 > /proc/pid/clear_refs clears ANON pages +echo 2 > /proc/pid/clear_refs clears file mapped pages +echo 3 > /proc/pid/clear_refs clears all pages +echo 4 > /proc/pid/clear_refs clears the soft dirty flag + +There are four components to pagemap: + +# /proc/pid/pagemap. + + - This file lets a userspace process find out which physical frame each virtual page is + mapped to.
It contains one 64-bit value for each virtual page, containing the + following data (from fs/proc/task_mmu.c, above pagemap_read): + + - Bits 0-54 page frame number (PFN) if present + - Bits 0-4 swap type if swapped + - Bits 5-54 swap offset if swapped + - Bit 55 pte is soft-dirty (see Soft-Dirty PTEs) + - Bit 56 page exclusively mapped (since 4.2) + - Bit 57 pte is uffd-wp write-protected (since 5.13) (see Userfaultfd) + - Bits 58-60 zero + - Bit 61 page is file-page or shared-anon (since 3.5) + - Bit 62 page swapped + - Bit 63 page present + + - Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs. In 4.0 and + 4.1 opens by unprivileged fail with -EPERM. Starting from 4.2 the PFN field is zeroed if + the user does not have CAP_SYS_ADMIN. Reason: information about PFNs helps in exploiting + Rowhammer vulnerability. + + - If the page is not present but in swap, then the PFN contains an encoding of the swap file + number and the page’s offset into the swap. Unmapped pages return a null PFN. This allows + determining precisely which pages are mapped (or in swap) and comparing mapped pages + between processes. + + - Efficient users of this interface will use /proc/pid/maps to determine which areas of + memory are actually mapped and llseek to skip over unmapped regions. + +# /proc/kpagecount. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. +*/ \ No newline at end of file