diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4a2d4e0..9012d65 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
 {
+	unsigned long flags;
	unsigned cpu = smp_processor_id();
	if (likely(prev != next)) {
@@ -43,6 +44,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
		percpu_write(cpu_tlbstate.active_mm, next);
 #endif
		cpumask_set_cpu(cpu, mm_cpumask(next));
+		spin_lock_irqsave(&lazy_unmap_lock, flags);
+		list_splice_tail(&unmapped_in_tlb[cpu], &unmapped_safe[cpu]);
+		INIT_LIST_HEAD(&unmapped_in_tlb[cpu]);
+		spin_unlock_irqrestore(&lazy_unmap_lock, flags);
		/* Re-load page tables */
		load_cr3(next->pgd);
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0a..98dfe9d 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -662,6 +662,9 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open 298
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_munmap_lazy 299
+__SYSCALL(__NR_munmap_lazy, sys_munmap_lazy)
+
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c93..fe2fd67 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -90,7 +90,7 @@ static void aio_free_ring(struct kioctx *ctx)
	if (info->mmap_size) {
		down_write(&ctx->mm->mmap_sem);
-		do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
+		do_munmap(ctx->mm, info->mmap_base, info->mmap_size, 0);
		up_write(&ctx->mm->mmap_sem);
	}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb5..4cc594a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -345,7 +345,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
		total_size = ELF_PAGEALIGN(total_size);
		map_addr = do_mmap(filep, addr, total_size, prot, type, off);
		if (!BAD_ADDR(map_addr))
-			do_munmap(current->mm, map_addr+size, total_size-size);
+			do_munmap(current->mm, map_addr+size, total_size-size,
+				  0);
	} else
		map_addr = do_mmap(filep, addr, size, prot, type, off);
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index d76b66a..ee5565a 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -623,8 +623,11 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open 241
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_munmap_lazy 242
+__SYSCALL(__NR_munmap_lazy, sys_munmap_lazy)
+
 #undef __NR_syscalls
-#define __NR_syscalls 242
+#define __NR_syscalls 243
 /*
  * All syscalls below here should go away really,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24c3956..2de2d39 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -103,6 +103,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_CAN_NONLINEAR 0x08000000	/* Has ->fault & does nonlinear pages */
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_SAO		0x20000000	/* Strong Access Ordering (powerpc) */
+#define VM_PHANTOM_UNLINKED 0x20000000	/* Lazily unmapped but also unlinked from mm due to exit (XXX new flag) */
 #define VM_PFN_AT_MMAP	0x40000000	/* PFNMAP vma that is fully mapped at mmap time */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
@@ -256,6 +257,18 @@ static inline int put_page_testzero(struct page *page)
	return atomic_dec_and_test(&page->_count);
 }
+static inline void get_page_lazy(struct page *page)
+{
+	VM_BUG_ON(atomic_read(&page->lazy_unmap_count));
+	atomic_inc(&page->lazy_unmap_count);
+}
+
+static inline int put_page_lazy(struct page *page)
+{
+	VM_BUG_ON(atomic_read(&page->lazy_unmap_count) == 0);
+	return atomic_dec_and_test(&page->lazy_unmap_count);
+}
+
 /*
  * Try to grab a ref unless the page has a refcount of zero, return false if
  * that is the case.
@@ -750,6 +763,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlb,
		struct vm_area_struct *start_vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *);
+unsigned long unmap_vmas_lazy(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start_addr, unsigned long end_addr,
+		struct list_head *batches);
 /**
  * mm_walk - callbacks for walk_page_range
@@ -1151,7 +1167,7 @@ out:
	return ret;
 }
-extern int do_munmap(struct mm_struct *, unsigned long, size_t);
+extern int do_munmap(struct mm_struct *, unsigned long, size_t, int);
 extern unsigned long do_brk(unsigned long, unsigned long);
@@ -1322,5 +1338,11 @@ extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern atomic_long_t mce_bad_pages;
+
+extern struct list_head unmapped_in_tlb[NR_CPUS];
+extern struct list_head unmapped_safe[NR_CPUS];
+extern spinlock_t lazy_unmap_lock;
+
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84a524a..e2f74dc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -41,6 +41,7 @@ struct page {
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	atomic_t _count;		/* Usage count, see below. */
+	atomic_t lazy_unmap_count;
	union {
		atomic_t _mapcount;	/* Count of ptes mapped in mms,
					 * to show when page is mapped
@@ -186,6 +187,8 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	struct list_head lazy_unmap_list;
+	atomic_t vm_phantom_count;
 };
 struct core_thread {
@@ -292,4 +295,13 @@ struct mm_struct {
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 #define mm_cpumask(mm) (&(mm)->cpu_vm_mask)
+/* XXX use slightly less than 512 to avoid internal fragmentation? */
+#define LAZY_UNMAP_BATCH_SIZE 512
+
+struct lazy_unmap_batch {
+	struct list_head list;
+	struct vm_area_struct *vma;
+	struct page *pages[LAZY_UNMAP_BATCH_SIZE];
+};
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a990ace..a522b26 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -527,6 +527,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
 asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice);
 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
 asmlinkage long sys_munmap(unsigned long addr, size_t len);
+asmlinkage long sys_munmap_lazy(unsigned long addr, size_t len);
 asmlinkage long sys_mlock(unsigned long start, size_t len);
 asmlinkage long sys_munlock(unsigned long start, size_t len);
 asmlinkage long sys_mlockall(int flags);
diff --git a/ipc/shm.c b/ipc/shm.c
index 464694e..7398cd9 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1019,7 +1019,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
				size = vma->vm_file->f_path.dentry->d_inode->i_size;
-				do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+				do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, 0);
				/*
				 * We discovered the size of the shm segment, so
				 * break out of here and fall through to the next
@@ -1046,7 +1046,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
		if ((vma->vm_ops == &shm_vm_ops) &&
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)
-			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, 0);
		vma = next;
	}
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd..5240a4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -959,6 +959,154 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
	return addr;
 }
+static unsigned long
+zap_pte_range_lazy(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr, unsigned long end, struct list_head *batches,
+		int *pages)
+{
+	struct lazy_unmap_batch *batch;
+	spinlock_t *ptl;
+	pte_t *pte;
+	int i;
+
+	i = 0;
+	batch = kzalloc(sizeof(struct lazy_unmap_batch), GFP_KERNEL);
+	list_add(&batch->list, batches);
+	batch->vma = vma;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	do {
+		pte_t ptent = *pte;
+		if (pte_none(ptent))
+			continue;
+		if (pte_present(ptent)) {
+			struct page *page;
+
+			page = vm_normal_page(vma, addr, ptent);
+			ptent = ptep_get_and_clear(mm, addr, pte);
+			if (page == NULL) {
+				/* XXX do a regular TLB shootdown */
+				//printk("%s: current %s not normal page addr 0x%lx\n", __func__, current->comm, addr);
+				continue;
+			}
+			if (!PageAnon(page)) {
+				if (pte_dirty(ptent))
+					set_page_dirty(page);
+				if (pte_young(ptent) &&
+				    likely(!VM_SequentialReadHint(vma)))
+					mark_page_accessed(page);
+			}
+			/*
+			 * If we're the first lazy unmap of the page, get a
+			 * reference to it to make sure it doesn't get
+			 * freed by someone else while it's still in a batch.
+			 */
+			if (atomic_add_return(num_online_cpus(),
+					&page->lazy_unmap_count) == num_online_cpus())
+				get_page(page);
+			if (unlikely(page_mapcount(page) < 0))
+				print_bad_pte(vma, addr, ptent, page);
+			batch->pages[i++] = page;
+			//printk(KERN_ALERT "%s: page %p count %d lazy count %d\n", __func__, page, page_count(page), atomic_read(&page->lazy_unmap_count));
+			(*pages)++;
+		}
+		/* XXX deal with non-present (nonlinear/swap) */
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	pte_unmap_unlock(pte - 1, ptl);
+	return addr;
+}
+
+static unsigned long
+zap_pmd_range_lazy(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud,
+		unsigned long addr, unsigned long end, struct list_head *batches,
+		int *pages)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		next = zap_pte_range_lazy(mm, vma, pmd, addr, next, batches,
+				pages);
+	} while (pmd++, addr = next, addr != end);
+
+	return addr;
+}
+
+static unsigned long
+zap_pud_range_lazy(struct mm_struct *mm, struct vm_area_struct *vma, pgd_t *pgd,
+		unsigned long addr, unsigned long end, struct list_head *batches,
+		int *pages)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		next = zap_pmd_range_lazy(mm, vma, pud, addr, next, batches,
+				pages);
+	} while (pud++, addr = next, addr != end);
+
+	return addr;
+}
+
+static unsigned long
+unmap_page_range_lazy(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end, struct list_head *batches,
+		int *pages)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		next = zap_pud_range_lazy(mm, vma, pgd, addr, next, batches,
+				pages);
+	} while (pgd++, addr = next, addr != end);
+
+	return addr;
+}
+
+unsigned long
+unmap_vmas_lazy(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start_addr, unsigned long end_addr, struct list_head *batches)
+{
+	unsigned long start = start_addr;
+	int vma_pages;
+
+	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
+		unsigned long end;
+
+		start = max(vma->vm_start, start_addr);
+		if (start >= vma->vm_end)
+			continue;
+		end = min(vma->vm_end, end_addr);
+		if (end <= vma->vm_start)
+			continue;
+		vma_pages = 0;
+		while (start != end) {
+			start = unmap_page_range_lazy(mm, vma, start, end,
+					batches, &vma_pages);
+		}
+
+		atomic_add(vma_pages, &vma->vm_phantom_count);
+		//printk(KERN_ALERT "%s: vma %p new count %d vma_pages %d\n", __func__, vma, atomic_read(&vma->vm_phantom_count), vma_pages);
+	}
+
+	return start;
+}
+
 #ifdef CONFIG_PREEMPT
 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
 #else
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b..eedf50a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -88,6 +89,15 @@ int sysctl_overcommit_ratio = 50;	/* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
+/* XXX avoid false sharing? */
+struct list_head unmapped_in_tlb[NR_CPUS];
+struct list_head unmapped_safe[NR_CPUS];
+struct timer_list lazy_unmap_timer[NR_CPUS];
+struct list_head lazy_unmap_free_vmas[NR_CPUS];
+
+DEFINE_SPINLOCK(lazy_unmap_lock);
+static void lazy_unmap_start_timer(int cpu);
+
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
@@ -230,6 +240,13 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 {
	struct vm_area_struct *next = vma->vm_next;
+	/* Lazily unmapped VMA. Let the laziness take care of the freeing */
+	if (atomic_read(&vma->vm_phantom_count)) {
+		/* XXX should be atomic? */
+		vma->vm_flags |= VM_PHANTOM_UNLINKED;
+		return next;
+	}
+
	might_sleep();
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
@@ -278,7 +295,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
-		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
+		if (!do_munmap(mm, newbrk, oldbrk-newbrk, 0))
			goto set_brk;
		goto out;
	}
@@ -513,6 +530,8 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
	long adjust_next = 0;
	int remove_next = 0;
+	BUG_ON(atomic_read(&vma->vm_phantom_count) != 0);
+
	if (next && !insert) {
		if (end >= next->vm_end) {
			/*
@@ -641,6 +660,10 @@ again:			remove_next = 1 + (end > next->vm_end);
		}
		mm->map_count--;
		mpol_put(vma_policy(next));
+		if (atomic_read(&next->vm_phantom_count)) {
+			printk(KERN_ALERT "%s: freeing phantom vma %p\n", __func__, next);
+			WARN_ON(1);
+		}
		kmem_cache_free(vm_area_cachep, next);
		/*
		 * In mprotect's case 6 (see comments on vma_merge),
@@ -670,6 +693,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
		return 0;
	if (vma->vm_ops && vma->vm_ops->close)
		return 0;
+	if (atomic_read(&vma->vm_phantom_count) != 0)
+		return 0;
	return 1;
 }
@@ -1134,7 +1159,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
-		if (do_munmap(mm, addr, len))
+		if (do_munmap(mm, addr, len, 0))
			return -ENOMEM;
		goto munmap_back;
	}
@@ -1260,6 +1285,9 @@ unmap_and_free_vma:
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	charged = 0;
 free_vma:
+	if (atomic_read(&vma->vm_phantom_count)) {
+		printk(KERN_ALERT "%s: freeing phantom vma %p\n", __func__, vma);
+	}
	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
	if (charged)
@@ -1805,20 +1833,24 @@ static void unmap_region(struct mm_struct *mm,
  */
 static void
 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
-	struct vm_area_struct *prev, unsigned long end)
+	struct vm_area_struct *prev, unsigned long end, int lazy)
 {
	struct vm_area_struct **insertion_point;
	struct vm_area_struct *tail_vma = NULL;
	unsigned long addr;
+	int i;
+	i = 0;
	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
	do {
		rb_erase(&vma->vm_rb, &mm->mm_rb);
		mm->map_count--;
		tail_vma = vma;
		vma = vma->vm_next;
+		i++;
	} while (vma && vma->vm_start < end);
	*insertion_point = vma;
+	WARN_ON(lazy && i > 1);
	tail_vma->vm_next = NULL;
	if (mm->unmap_area == arch_unmap_area)
		addr = prev ? prev->vm_end : mm->mmap_base;
@@ -1851,6 +1883,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
	/* most fields are the same, copy all, and then fixup */
	*new = *vma;
+	atomic_set(&new->vm_phantom_count, 0);
	if (new_below)
		new->vm_end = addr;
@@ -1861,6 +1894,9 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
	pol = mpol_dup(vma_policy(vma));
	if (IS_ERR(pol)) {
+		if (atomic_read(&vma->vm_phantom_count)) {
+			printk(KERN_ALERT "%s: freeing phantom vma %p\n", __func__, vma);
+		}
		kmem_cache_free(vm_area_cachep, new);
		return PTR_ERR(pol);
	}
@@ -1884,12 +1920,160 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
	return 0;
 }
+static void
+lazy_unmap_work(struct work_struct *work)
+{
+	struct vm_area_struct *next, *prev, *_vma, *vma, **insertion_point;
+	struct mm_struct *mm;
+	unsigned long flags;
+	int cpu;
+
+	cpu = get_cpu();
+	local_irq_save(flags);
+	list_for_each_entry_safe(vma, next, &lazy_unmap_free_vmas[cpu],
+			lazy_unmap_list) {
+		mm = vma->vm_mm;
+		if (vma->vm_flags & VM_PHANTOM_UNLINKED) {
+			//printk(KERN_ALERT "%s: cpu %d delayed free vma %p start 0x%lx end 0x%lx\n", __func__, cpu, vma, vma->vm_start, vma->vm_end);
+			list_del(&vma->lazy_unmap_list);
+			kmem_cache_free(vm_area_cachep, vma);
+			continue;
+		}
+		BUG_ON(mm == NULL);
+		//printk(KERN_ALERT "%s: cpu %d delayed detach vma %p start 0x%lx end 0x%lx\n", __func__, cpu, vma, vma->vm_start, vma->vm_end);
+		list_del(&vma->lazy_unmap_list);
+		local_irq_restore(flags);
+
+		down_write(&mm->mmap_sem);
+		_vma = find_vma_prev(mm, vma->vm_start, &prev);
+		BUG_ON(_vma != vma);
+		WARN_ON(prev == NULL);
+
+		insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		mm->map_count--;
+		*insertion_point = vma->vm_next;
+		vma->vm_next = NULL;
+
+		//detach_vmas_to_be_unmapped(mm, vma, prev, vma->vm_end, 1);
+		remove_vma_list(mm, vma);
+		up_write(&mm->mmap_sem);
+
+		local_irq_save(flags);
+	}
+	local_irq_restore(flags);
+	put_cpu();
+
+	kfree(work);
+}
+
+static void
+lazy_unmap_free_page(struct page *page, struct vm_area_struct *vma)
+{
+	struct work_struct *work;
+	int cpu;
+
+	//printk("%s: releasing page %p mapcount %d\n", __func__, page, page_mapcount(page));
+
+	cpu = smp_processor_id();
+
+	page_remove_rmap(page);
+	put_page(page);
+
+	//printk(KERN_ALERT "cpu %ld vma %p count %d\n", cpu, vma, atomic_read(&vma->vm_phantom_count));
+
+	BUG_ON(atomic_read(&vma->vm_phantom_count) <= 0);
+
+	/* Do we need to free the vma? */
+	if (atomic_dec_and_test(&vma->vm_phantom_count)) {
+
+		/* The mm was destroyed already, so we can just free the vma */
+		if (vma->vm_flags & VM_PHANTOM_UNLINKED) {
+			//printk(KERN_ALERT "%s: freeing (immediate) vma %p start 0x%lx end 0x%lx\n", __func__, vma, vma->vm_start, vma->vm_end);
+			kmem_cache_free(vm_area_cachep, vma);
+			return;
+		}
+
+		//printk(KERN_ALERT "%s: cpu %d freeing (delayed) vma %p start 0x%lx end 0x%lx\n", __func__, cpu, vma, vma->vm_start, vma->vm_end);
+		list_add(&vma->lazy_unmap_list, &lazy_unmap_free_vmas[cpu]);
+		work = kmalloc(sizeof(struct work_struct), GFP_ATOMIC);
+		if (work == NULL)
+			return;
+		INIT_WORK(work, lazy_unmap_work);
+		schedule_work_on(cpu, work);
+	}
+}
+
+static void
+lazy_unmap_timer_fn(unsigned long cpu)
+{
+	struct lazy_unmap_batch *b, *next;
+	struct vm_area_struct *vma;
+	unsigned long flags;
+	struct page *page;
+	int done, i;
+
+	done = 0;
+	spin_lock_irqsave(&lazy_unmap_lock, flags);
+	list_for_each_entry_safe(b, next, &unmapped_safe[cpu], list) {
+		for (i = 0; i < LAZY_UNMAP_BATCH_SIZE; i++) {
+			vma = b->vma;
+			page = b->pages[i];
+			if (page == NULL)
+				break;
+			done++;
+			//printk(KERN_ALERT "%s cpu %ld page %p count %d lazy count %d vma %p count %d\n", __func__, cpu, page, page_count(page), atomic_read(&page->lazy_unmap_count), vma, atomic_read(&vma->vm_phantom_count));
+			if (put_page_lazy(page))
+				lazy_unmap_free_page(page, vma);
+		}
+		list_del(&b->list);
+		kfree(b);
+		if (done >= 512)	/* XXX tunable? */
+			break;
+	}
+	spin_unlock_irqrestore(&lazy_unmap_lock, flags);
+
+	lazy_unmap_start_timer(cpu);
+}
+
+static void
+do_lazy_munmap(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	struct lazy_unmap_batch *b, *cpubatch, *next;
+	struct list_head batches;
+	unsigned long flags;
+	int cpu;
+
+	INIT_LIST_HEAD(&batches);
+
+	unmap_vmas_lazy(mm, vma, start, end, &batches);
+
+	list_for_each_entry_safe(b, next, &batches, list) {
+		list_del(&b->list);
+		for_each_online_cpu(cpu) {
+			cpubatch = kmalloc(sizeof(struct lazy_unmap_batch),
+					GFP_KERNEL);
+			memcpy(cpubatch, b, sizeof(struct lazy_unmap_batch));
+
+			/* XXX don't grab lock for each one? */
+			spin_lock_irqsave(&lazy_unmap_lock, flags);
+			/* XXX move to safe list for current cpu */
+			list_add(&cpubatch->list, &unmapped_in_tlb[cpu]);
+			spin_unlock_irqrestore(&lazy_unmap_lock, flags);
+		}
+		kfree(b);
+	}
+	/* XXX make it work */
+	//free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+	//	next? next->vm_start: 0);
+
+}
 /* Munmap is split into 2 main parts -- this part which finds
  * what needs doing, and the areas themselves, which do the
  * work.  This now handles partial unmappings.
  * Jeremy Fitzhardinge
  */
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, int lazy)
 {
	unsigned long end;
	struct vm_area_struct *vma, *prev, *last;
@@ -1919,7 +2103,11 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
	 * places tmp vma above, and higher split_vma places tmp vma below.
	 */
	if (start > vma->vm_start) {
-		int error = split_vma(mm, vma, start, 0);
+		int error;
+		/* XXX disallow splitting (unless it's the last vma) for lazy unmap? */
+		//if (lazy)
+		//printk(KERN_ALERT "%s: splitting lazy vma %p start 0x%lx end 0x%lx requested start 0x%lx requested end 0x%lx\n", __func__, vma, vma->vm_start, vma->vm_end, start, end);
+		error = split_vma(mm, vma, start, 0);
		if (error)
			return error;
		prev = vma;
@@ -1928,7 +2116,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
	/* Does it split the last one? */
	last = find_vma(mm, end);
	if (last && end > last->vm_start) {
-		int error = split_vma(mm, last, end, 1);
+		int error;
+		//if (lazy)
+		//printk(KERN_ALERT "%s: splitting lazy last vma %p start 0x%lx end 0x%lx requested start 0x%lx requested end 0x%lx\n", __func__, last, last->vm_start, last->vm_end, start, end);
+		error = split_vma(mm, last, end, 1);
		if (error)
			return error;
	}
@@ -1948,14 +2139,18 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
		}
	}
-	/*
-	 * Remove the vma's, and unmap the actual pages
-	 */
-	detach_vmas_to_be_unmapped(mm, vma, prev, end);
-	unmap_region(mm, vma, prev, start, end);
+	if (lazy == 0) {
+		/*
+		 * Remove the vma's, and unmap the actual pages
+		 */
+		detach_vmas_to_be_unmapped(mm, vma, prev, end, 0);
+		unmap_region(mm, vma, prev, start, end);
-	/* Fix up all other VM information */
-	remove_vma_list(mm, vma);
+		/* Fix up all other VM information */
+		remove_vma_list(mm, vma);
+	} else {
+		do_lazy_munmap(mm, vma, start, end);
+	}
	return 0;
 }
@@ -1970,11 +2165,27 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
	profile_munmap(addr);
	down_write(&mm->mmap_sem);
-	ret = do_munmap(mm, addr, len);
+	ret = do_munmap(mm, addr, len, 0);
	up_write(&mm->mmap_sem);
	return ret;
 }
+SYSCALL_DEFINE2(munmap_lazy, unsigned long, addr, size_t, len)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	mm = current->mm;
+
+	//printk(KERN_ALERT "%s: addr %lx len %ld\n", __func__, addr, len);
+
+	down_write(&mm->mmap_sem);
+	ret = do_munmap(mm, addr, len, 1);
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
 #ifdef CONFIG_DEBUG_VM
@@ -2044,7 +2255,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
-		if (do_munmap(mm, addr, len))
+		if (do_munmap(mm, addr, len, 0))
			return -ENOMEM;
		goto munmap_back;
	}
@@ -2136,7 +2347,7 @@ void exit_mmap(struct mm_struct *mm)
	while (vma)
		vma = remove_vma(vma);
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+	//BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 /* Insert vm structure into process list sorted by address
@@ -2209,8 +2420,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (new_vma) {
		*new_vma = *vma;
+		atomic_set(&new_vma->vm_phantom_count, 0);
		pol = mpol_dup(vma_policy(vma));
		if (IS_ERR(pol)) {
+			if (atomic_read(&vma->vm_phantom_count)) {
+				printk(KERN_ALERT "%s: freeing phantom vma %p\n", __func__, vma);
+			}
			kmem_cache_free(vm_area_cachep, new_vma);
			return NULL;
		}
@@ -2317,6 +2532,9 @@ int install_special_mapping(struct mm_struct *mm,
	vma->vm_private_data = pages;
	if (unlikely(insert_vm_struct(mm, vma))) {
+		if (atomic_read(&vma->vm_phantom_count)) {
+			printk(KERN_ALERT "%s: freeing phantom vma %p\n", __func__, vma);
+		}
		kmem_cache_free(vm_area_cachep, vma);
		return -ENOMEM;
	}
@@ -2492,13 +2710,56 @@ void mm_drop_all_locks(struct mm_struct *mm)
	mutex_unlock(&mm_all_locks_mutex);
 }
+static void
+lazy_unmap_start_timer(int cpu)
+{
+	struct timer_list *timer;
+
+	timer = &lazy_unmap_timer[cpu];
+	init_timer(timer);
+	timer_stats_timer_set_start_info(timer);
+	timer->expires = jiffies + HZ * 1;	/* XXX constant */
+	timer->data = cpu;
+	timer->function = lazy_unmap_timer_fn;
+
+	add_timer_on(timer, cpu);
+}
+
+static int __cpuinit
+lazy_unmap_cpu_up(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	long cpu;
+
+	cpu = (long)hcpu;
+	switch (action) {
+	case CPU_ONLINE:
+		lazy_unmap_start_timer(cpu);
+		break;
+	/* XXX offline */
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata lazy_unmap_cpu_up_notifier = {
+	&lazy_unmap_cpu_up, NULL, 0
+};
+
 /*
  * initialise the VMA slab
  */
 void __init mmap_init(void)
 {
-	int ret;
+	int i, ret;
	ret = percpu_counter_init(&vm_committed_as, 0);
	VM_BUG_ON(ret);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		INIT_LIST_HEAD(&unmapped_in_tlb[i]);
+		INIT_LIST_HEAD(&unmapped_safe[i]);
+		INIT_LIST_HEAD(&lazy_unmap_free_vmas[i]);
+	}
+	register_cpu_notifier(&lazy_unmap_cpu_up_notifier);
+	lazy_unmap_start_timer(smp_processor_id());
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff25..127262f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -237,7 +237,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
	mm->total_vm += new_len >> PAGE_SHIFT;
	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
-	if (do_munmap(mm, old_addr, old_len) < 0) {
+	if (do_munmap(mm, old_addr, old_len, 0) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		vm_unacct_memory(excess >> PAGE_SHIFT);
		excess = 0;
@@ -317,7 +317,7 @@ unsigned long do_mremap(unsigned long addr,
		if (ret)
			goto out;
-		ret = do_munmap(mm, new_addr, new_len);
+		ret = do_munmap(mm, new_addr, new_len, 0);
		if (ret)
			goto out;
	}
@@ -328,7 +328,7 @@ unsigned long do_mremap(unsigned long addr,
	 * do_munmap does all the needed commit accounting
	 */
	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len);
+		ret = do_munmap(mm, addr+new_len, old_len - new_len, 0);
		if (ret && old_len != new_len)
			goto out;
		ret = addr;
diff --git a/mm/nommu.c b/mm/nommu.c
index 9876fa0..4c17796 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1508,7 +1508,7 @@ static int shrink_vma(struct mm_struct *mm,
  * - under NOMMU conditions the chunk to be unmapped must be backed by a single
  *   VMA, though it need not cover the whole VMA
  */
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, int lazy)
 {
	struct vm_area_struct *vma;
	struct rb_node *rb;
@@ -1589,7 +1589,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
	struct mm_struct *mm = current->mm;
	down_write(&mm->mmap_sem);
-	ret = do_munmap(mm, addr, len);
+	ret = do_munmap(mm, addr, len, 0);
	up_write(&mm->mmap_sem);
	return ret;
 }
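
For reference, a minimal user-space sketch of how the new syscall could be exercised once this patch is applied. This is illustrative only and not part of the patch: it assumes the x86-64 syscall number 299 defined above, and since glibc has no wrapper it goes through syscall(2) directly; the file name and the mapping size are arbitrary.

/* munmap_lazy_demo.c - illustrative sketch, not part of the patch.
 * Assumes __NR_munmap_lazy == 299 (x86-64 number added by this patch).
 * On an unpatched kernel the call simply fails with ENOSYS.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_munmap_lazy
#define __NR_munmap_lazy 299
#endif

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);	/* fault the pages in */

	/* Tear the mapping down lazily: the PTEs are cleared immediately,
	 * but page freeing and TLB invalidation are deferred to later
	 * context switches and the per-CPU timer added by the patch. */
	if (syscall(__NR_munmap_lazy, (unsigned long)p, len) != 0) {
		perror("munmap_lazy");
		return 1;
	}
	return 0;
}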