// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));

	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
	folio_mark_uptodate(folio);
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	unsigned long nr_pages, i;
	pgoff_t index;
	int r;

	nr_pages = folio_nr_pages(folio);
	for (i = 0; i < nr_pages; i++)
		clear_highpage(folio_page(folio, i));

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios.  The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
	index = gfn - slot->base_gfn + slot->gmem.pgoff;
	index = ALIGN_DOWN(index, 1 << folio_order(folio));
	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
	if (!r)
		kvm_gmem_mark_prepared(folio);

	return r;
}

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	return filemap_grab_folio(inode->i_mapping, index);
}
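/*
 * A worked example of the gfn -> index -> pfn arithmetic above, using
 * hypothetical values and an order-9 (2MiB) folio purely for illustration;
 * the current code only allocates order-0 folios.
 *
 *	slot->base_gfn   = 0x100000
 *	slot->gmem.pgoff = 0x200	(512-page aligned, as the WARN_ON checks)
 *	gfn              = 0x100203
 *	index            = gfn - base_gfn + pgoff     = 0x403
 *	aligned index    = ALIGN_DOWN(0x403, 512)     = 0x400
 *	folio_pfn(folio) = 0x140000	(pfn of the folio's first page)
 *	folio_file_pfn(folio, 0x403) = 0x140000 + (0x403 & 0x1ff) = 0x140003
 *
 * I.e. folio_file_pfn() selects the page within the folio that backs @index,
 * while kvm_gmem_prepare_folio() rounds @index down so that the arch prepare
 * hook sees the folio-aligned starting index/GFN/PFN.
 */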
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}
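/*
 * A minimal userspace sketch of how the fallocate() handler above is driven,
 * assuming @gmem_fd was returned by KVM_CREATE_GUEST_MEMFD (see
 * kvm_gmem_create() below).  FALLOC_FL_KEEP_SIZE is mandatory in both
 * directions and offsets/lengths must be page-aligned:
 *
 *	// Preallocate the first 2MiB of guest-private memory.
 *	if (fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, 0, 2 * 1024 * 1024))
 *		err(1, "fallocate(KEEP_SIZE)");
 *
 *	// Discard the same range again, e.g. after a private->shared
 *	// conversion; SPTEs are zapped and the backing pages are freed.
 *	if (fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		      0, 2 * 1024 * 1024))
 *		err(1, "fallocate(PUNCH_HOLE)");
 */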
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}

static struct file_operations kvm_gmem_fops = {
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */
	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio		= noop_dirty_folio,
	.migrate_folio		= kvm_gmem_migrate_folio,
	.error_remove_folio	= kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio		= kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.getattr	= kvm_gmem_getattr,
	.setattr	= kvm_gmem_setattr,
};
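/*
 * For reference, a sketch of the optional per-arch hooks used above.  The
 * signatures are taken from the call sites in this file; the parameter names
 * and comments are illustrative only, and real implementations live in arch
 * code (e.g. SEV-SNP's RMP management on x86):
 *
 *	// CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE: make the PFN(s) backing @gfn
 *	// usable as guest-private memory before they are mapped.
 *	int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 *				  int max_order);
 *
 *	// CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE: undo the above for the PFN
 *	// range [start, end) when the backing folio is freed.
 *	void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
 */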
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
	 * see either a NULL file or this new file, no need for them to go
	 * away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}
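/*
 * A hedged userspace sketch of the create/bind flow that lands in
 * kvm_gmem_create() and kvm_gmem_bind().  The sizes, slot number, and the
 * vm_fd/shared_backing names are illustrative; error handling is elided.
 *
 *	struct kvm_create_guest_memfd gmem = {
 *		.size  = GUEST_MEM_SIZE,	// page-aligned, > 0
 *		.flags = 0,			// no flags are accepted yet
 *	};
 *	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
 *
 *	struct kvm_userspace_memory_region2 region = {
 *		.slot = 0,
 *		.flags = KVM_MEM_GUEST_MEMFD,
 *		.guest_phys_addr = 0,
 *		.memory_size = GUEST_MEM_SIZE,
 *		.userspace_addr = (__u64)shared_backing,  // shared pages
 *		.guest_memfd = gmem_fd,
 *		// page-aligned; >> PAGE_SHIFT becomes slot->gmem.pgoff
 *		.guest_memfd_offset = 0,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 *
 * Closing gmem_fd before deleting the memslot is legal: kvm_gmem_release()
 * invalidates the binding, and kvm_gmem_unbind() then sees a NULL file.
 */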
/* Returns a locked folio on success.  */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					gfn_t gfn, kvm_pfn_t *pfn,
					bool *is_prepared, int *max_order)
{
	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
	struct kvm_gmem *gmem = file->private_data;
	struct folio *folio;

	if (file != slot->gmem.file) {
		WARN_ON_ONCE(slot->gmem.file);
		return ERR_PTR(-EFAULT);
	}

	gmem = file->private_data;
	if (xa_load(&gmem->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&gmem->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	*is_prepared = folio_test_uptodate(folio);
	return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	struct file *file = kvm_gmem_get_file(slot);
	struct folio *folio;
	bool is_prepared = false;
	int r = 0;

	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order);
	if (IS_ERR(folio)) {
		r = PTR_ERR(folio);
		goto out;
	}

	if (!is_prepared)
		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (r < 0)
		folio_put(folio);

out:
	fput(file);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);

#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct file *file;
	struct kvm_memory_slot *slot;
	void __user *p;

	int ret = 0, max_order;
	long i;

	lockdep_assert_held(&kvm->slots_lock);
	if (npages < 0)
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_can_be_private(slot))
		return -EINVAL;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	filemap_invalidate_lock(file->f_mapping);

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i += (1 << max_order)) {
		struct folio *folio;
		gfn_t gfn = start_gfn + i;
		bool is_prepared = false;
		kvm_pfn_t pfn;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		if (is_prepared) {
			folio_unlock(folio);
			folio_put(folio);
			ret = -EEXIST;
			break;
		}

		folio_unlock(folio);
		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
			(npages - i) < (1 << max_order));

		ret = -EINVAL;
		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
							KVM_MEMORY_ATTRIBUTE_PRIVATE,
							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
			if (!max_order)
				goto put_folio_and_exit;
			max_order--;
		}

		p = src ? src + i * PAGE_SIZE : NULL;
		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
		if (!ret)
			kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
		folio_put(folio);
		if (ret)
			break;
	}

	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
#endif
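/*
 * Rough sketch of how arch code is expected to consume the two exports above;
 * the surrounding names (fault_slot, fault_gfn, etc.) are illustrative, not
 * taken from this file.  On a fault against guest-private memory, the arch
 * fault path resolves the PFN directly from guest_memfd instead of the
 * userspace page tables:
 *
 *	int max_order;
 *	kvm_pfn_t pfn;
 *	int r = kvm_gmem_get_pfn(kvm, fault_slot, fault_gfn, &pfn, &max_order);
 *	if (r == -EHWPOISON)
 *		// report the poisoned page to userspace
 *	else if (!r)
 *		// map @pfn for @fault_gfn, clamping the mapping to @max_order
 *
 * kvm_gmem_populate() is intended for initial payload installation and
 * measurement (e.g. SEV-SNP's KVM_SEV_SNP_LAUNCH_UPDATE): the arch ioctl
 * supplies a post_populate callback that copies/encrypts @src into the
 * gmem-backed @pfn, after which kvm_gmem_mark_prepared() flags the folio as
 * ready for the guest.
 */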