From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- mm/Makefile | 20 + mm/bootmem.c | 400 +++++++ mm/fadvise.c | 111 ++ mm/filemap.c | 2306 ++++++++++++++++++++++++++++++++++++++ mm/fremap.c | 256 +++++ mm/highmem.c | 607 ++++++++++ mm/hugetlb.c | 260 +++++ mm/internal.h | 13 + mm/madvise.c | 242 ++++ mm/memory.c | 2165 ++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 1138 +++++++++++++++++++ mm/mempool.c | 290 +++++ mm/mincore.c | 191 ++++ mm/mlock.c | 253 +++++ mm/mmap.c | 2082 +++++++++++++++++++++++++++++++++++ mm/mprotect.c | 282 +++++ mm/mremap.c | 426 +++++++ mm/msync.c | 236 ++++ mm/nommu.c | 1180 ++++++++++++++++++++ mm/oom_kill.c | 292 +++++ mm/page-writeback.c | 819 ++++++++++++++ mm/page_alloc.c | 2220 +++++++++++++++++++++++++++++++++++++ mm/page_io.c | 160 +++ mm/pdflush.c | 228 ++++ mm/prio_tree.c | 207 ++++ mm/readahead.c | 557 ++++++++++ mm/rmap.c | 862 +++++++++++++++ mm/shmem.c | 2326 +++++++++++++++++++++++++++++++++++++++ mm/slab.c | 3060 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/swap.c | 485 ++++++++ mm/swap_state.c | 382 +++++++ mm/swapfile.c | 1672 ++++++++++++++++++++++++++++ mm/thrash.c | 102 ++ mm/tiny-shmem.c | 122 ++ mm/truncate.c | 336 ++++++ mm/vmalloc.c | 588 ++++++++++ mm/vmscan.c | 1311 ++++++++++++++++++++++ 37 files changed, 28187 insertions(+) create mode 100644 mm/Makefile create mode 100644 mm/bootmem.c create mode 100644 mm/fadvise.c create mode 100644 mm/filemap.c create mode 100644 mm/fremap.c create mode 100644 mm/highmem.c create mode 100644 mm/hugetlb.c create mode 100644 mm/internal.h create mode 100644 mm/madvise.c create mode 100644 mm/memory.c create mode 100644 mm/mempolicy.c create mode 100644 mm/mempool.c create mode 100644 mm/mincore.c create mode 100644 mm/mlock.c create mode 100644 mm/mmap.c create mode 100644 mm/mprotect.c create mode 100644 mm/mremap.c create mode 100644 mm/msync.c create mode 100644 mm/nommu.c create mode 100644 mm/oom_kill.c create mode 100644 mm/page-writeback.c create mode 100644 mm/page_alloc.c create mode 100644 mm/page_io.c create mode 100644 mm/pdflush.c create mode 100644 mm/prio_tree.c create mode 100644 mm/readahead.c create mode 100644 mm/rmap.c create mode 100644 mm/shmem.c create mode 100644 mm/slab.c create mode 100644 mm/swap.c create mode 100644 mm/swap_state.c create mode 100644 mm/swapfile.c create mode 100644 mm/thrash.c create mode 100644 mm/tiny-shmem.c create mode 100644 mm/truncate.c create mode 100644 mm/vmalloc.c create mode 100644 mm/vmscan.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile new file mode 100644 index 000000000000..097408064f6a --- /dev/null +++ b/mm/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the linux memory manager. +# + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + vmalloc.o + +obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ + page_alloc.o page-writeback.o pdflush.o \ + readahead.o slab.o swap.o truncate.o vmscan.o \ + prio_tree.o $(mmu-y) + +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_SHMEM) += shmem.o +obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o + diff --git a/mm/bootmem.c b/mm/bootmem.c new file mode 100644 index 000000000000..260e703850d8 --- /dev/null +++ b/mm/bootmem.c @@ -0,0 +1,400 @@ +/* + * linux/mm/bootmem.c + * + * Copyright (C) 1999 Ingo Molnar + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * + * simple boot-time physical memory area allocator and + * free memory collector. It's used to deal with reserved + * system memory and memory holes as well. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Access to this subsystem has to be serialized externally. (this is + * true for the boot process anyway) + */ +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +EXPORT_SYMBOL(max_pfn); /* This is exported so + * dma_get_required_mask(), which uses + * it, can be an inline function */ + +/* return the number of _pages_ that will be allocated for the boot bitmap */ +unsigned long __init bootmem_bootmap_pages (unsigned long pages) +{ + unsigned long mapsize; + + mapsize = (pages+7)/8; + mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; + mapsize >>= PAGE_SHIFT; + + return mapsize; +} + +/* + * Called once to set up the allocator itself. + */ +static unsigned long __init init_bootmem_core (pg_data_t *pgdat, + unsigned long mapstart, unsigned long start, unsigned long end) +{ + bootmem_data_t *bdata = pgdat->bdata; + unsigned long mapsize = ((end - start)+7)/8; + + pgdat->pgdat_next = pgdat_list; + pgdat_list = pgdat; + + mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); + bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); + bdata->node_boot_start = (start << PAGE_SHIFT); + bdata->node_low_pfn = end; + + /* + * Initially all pages are reserved - setup_arch() has to + * register free RAM areas explicitly. + */ + memset(bdata->node_bootmem_map, 0xff, mapsize); + + return mapsize; +} + +/* + * Marks a particular physical memory range as unallocatable. Usable RAM + * might be used for boot-time allocations - or it might get added + * to the free page pool later on. + */ +static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +{ + unsigned long i; + /* + * round up, partially reserved pages are considered + * fully reserved. + */ + unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; + unsigned long eidx = (addr + size - bdata->node_boot_start + + PAGE_SIZE-1)/PAGE_SIZE; + unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + + BUG_ON(!size); + BUG_ON(sidx >= eidx); + BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); + BUG_ON(end > bdata->node_low_pfn); + + for (i = sidx; i < eidx; i++) + if (test_and_set_bit(i, bdata->node_bootmem_map)) { +#ifdef CONFIG_DEBUG_BOOTMEM + printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); +#endif + } +} + +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +{ + unsigned long i; + unsigned long start; + /* + * round down end of usable mem, partially free pages are + * considered reserved. + */ + unsigned long sidx; + unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; + unsigned long end = (addr + size)/PAGE_SIZE; + + BUG_ON(!size); + BUG_ON(end > bdata->node_low_pfn); + + if (addr < bdata->last_success) + bdata->last_success = addr; + + /* + * Round up the beginning of the address. + */ + start = (addr + PAGE_SIZE-1) / PAGE_SIZE; + sidx = start - (bdata->node_boot_start/PAGE_SIZE); + + for (i = sidx; i < eidx; i++) { + if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) + BUG(); + } +} + +/* + * We 'merge' subsequent allocations to save space. We might 'lose' + * some fraction of a page if allocations cannot be satisfied due to + * size constraints on boxes where there is physical RAM space + * fragmentation - in these cases (mostly large memory boxes) this + * is not a problem. + * + * On low memory boxes we get it right in 100% of the cases. + * + * alignment has to be a power of 2 value. + * + * NOTE: This function is _not_ reentrant. + */ +static void * __init +__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, + unsigned long align, unsigned long goal) +{ + unsigned long offset, remaining_size, areasize, preferred; + unsigned long i, start = 0, incr, eidx; + void *ret; + + if(!size) { + printk("__alloc_bootmem_core(): zero-sized request\n"); + BUG(); + } + BUG_ON(align & (align-1)); + + eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + offset = 0; + if (align && + (bdata->node_boot_start & (align - 1UL)) != 0) + offset = (align - (bdata->node_boot_start & (align - 1UL))); + offset >>= PAGE_SHIFT; + + /* + * We try to allocate bootmem pages above 'goal' + * first, then we try to allocate lower pages. + */ + if (goal && (goal >= bdata->node_boot_start) && + ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { + preferred = goal - bdata->node_boot_start; + + if (bdata->last_success >= preferred) + preferred = bdata->last_success; + } else + preferred = 0; + + preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; + preferred += offset; + areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; + incr = align >> PAGE_SHIFT ? : 1; + +restart_scan: + for (i = preferred; i < eidx; i += incr) { + unsigned long j; + i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); + i = ALIGN(i, incr); + if (test_bit(i, bdata->node_bootmem_map)) + continue; + for (j = i + 1; j < i + areasize; ++j) { + if (j >= eidx) + goto fail_block; + if (test_bit (j, bdata->node_bootmem_map)) + goto fail_block; + } + start = i; + goto found; + fail_block: + i = ALIGN(j, incr); + } + + if (preferred > offset) { + preferred = offset; + goto restart_scan; + } + return NULL; + +found: + bdata->last_success = start << PAGE_SHIFT; + BUG_ON(start >= eidx); + + /* + * Is the next page of the previous allocation-end the start + * of this allocation's buffer? If yes then we can 'merge' + * the previous partial page with this allocation. + */ + if (align < PAGE_SIZE && + bdata->last_offset && bdata->last_pos+1 == start) { + offset = (bdata->last_offset+align-1) & ~(align-1); + BUG_ON(offset > PAGE_SIZE); + remaining_size = PAGE_SIZE-offset; + if (size < remaining_size) { + areasize = 0; + /* last_pos unchanged */ + bdata->last_offset = offset+size; + ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + bdata->node_boot_start); + } else { + remaining_size = size - remaining_size; + areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; + ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + bdata->node_boot_start); + bdata->last_pos = start+areasize-1; + bdata->last_offset = remaining_size; + } + bdata->last_offset &= ~PAGE_MASK; + } else { + bdata->last_pos = start + areasize - 1; + bdata->last_offset = size & ~PAGE_MASK; + ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + } + + /* + * Reserve the area now: + */ + for (i = start; i < start+areasize; i++) + if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) + BUG(); + memset(ret, 0, size); + return ret; +} + +static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) +{ + struct page *page; + bootmem_data_t *bdata = pgdat->bdata; + unsigned long i, count, total = 0; + unsigned long idx; + unsigned long *map; + int gofast = 0; + + BUG_ON(!bdata->node_bootmem_map); + + count = 0; + /* first extant page of the node */ + page = virt_to_page(phys_to_virt(bdata->node_boot_start)); + idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + map = bdata->node_bootmem_map; + /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ + if (bdata->node_boot_start == 0 || + ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG)) + gofast = 1; + for (i = 0; i < idx; ) { + unsigned long v = ~map[i / BITS_PER_LONG]; + if (gofast && v == ~0UL) { + int j, order; + + count += BITS_PER_LONG; + __ClearPageReserved(page); + order = ffs(BITS_PER_LONG) - 1; + set_page_refs(page, order); + for (j = 1; j < BITS_PER_LONG; j++) { + if (j + 16 < BITS_PER_LONG) + prefetchw(page + j + 16); + __ClearPageReserved(page + j); + } + __free_pages(page, order); + i += BITS_PER_LONG; + page += BITS_PER_LONG; + } else if (v) { + unsigned long m; + for (m = 1; m && i < idx; m<<=1, page++, i++) { + if (v & m) { + count++; + __ClearPageReserved(page); + set_page_refs(page, 0); + __free_page(page); + } + } + } else { + i+=BITS_PER_LONG; + page += BITS_PER_LONG; + } + } + total += count; + + /* + * Now free the allocator bitmap itself, it's not + * needed anymore: + */ + page = virt_to_page(bdata->node_bootmem_map); + count = 0; + for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { + count++; + __ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + total += count; + bdata->node_bootmem_map = NULL; + + return total; +} + +unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +{ + return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); +} + +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +{ + reserve_bootmem_core(pgdat->bdata, physaddr, size); +} + +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +{ + free_bootmem_core(pgdat->bdata, physaddr, size); +} + +unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) +{ + return(free_all_bootmem_core(pgdat)); +} + +unsigned long __init init_bootmem (unsigned long start, unsigned long pages) +{ + max_low_pfn = pages; + min_low_pfn = start; + return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); +} + +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +void __init reserve_bootmem (unsigned long addr, unsigned long size) +{ + reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); +} +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ + +void __init free_bootmem (unsigned long addr, unsigned long size) +{ + free_bootmem_core(NODE_DATA(0)->bdata, addr, size); +} + +unsigned long __init free_all_bootmem (void) +{ + return(free_all_bootmem_core(NODE_DATA(0))); +} + +void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +{ + pg_data_t *pgdat = pgdat_list; + void *ptr; + + for_each_pgdat(pgdat) + if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + align, goal))) + return(ptr); + + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if (ptr) + return (ptr); + + return __alloc_bootmem(size, align, goal); +} + diff --git a/mm/fadvise.c b/mm/fadvise.c new file mode 100644 index 000000000000..57264d74b8bf --- /dev/null +++ b/mm/fadvise.c @@ -0,0 +1,111 @@ +/* + * mm/fadvise.c + * + * Copyright (C) 2002, Linus Torvalds + * + * 11Jan2003 akpm@digeo.com + * Initial version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could + * deactivate the pages and clear PG_Referenced. + */ +asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +{ + struct file *file = fget(fd); + struct address_space *mapping; + struct backing_dev_info *bdi; + loff_t endbyte; + pgoff_t start_index; + pgoff_t end_index; + unsigned long nrpages; + int ret = 0; + + if (!file) + return -EBADF; + + mapping = file->f_mapping; + if (!mapping || len < 0) { + ret = -EINVAL; + goto out; + } + + /* Careful about overflows. Len == 0 means "as much as possible" */ + endbyte = offset + len; + if (!len || endbyte < len) + endbyte = -1; + + bdi = mapping->backing_dev_info; + + switch (advice) { + case POSIX_FADV_NORMAL: + file->f_ra.ra_pages = bdi->ra_pages; + break; + case POSIX_FADV_RANDOM: + file->f_ra.ra_pages = 0; + break; + case POSIX_FADV_SEQUENTIAL: + file->f_ra.ra_pages = bdi->ra_pages * 2; + break; + case POSIX_FADV_WILLNEED: + case POSIX_FADV_NOREUSE: + if (!mapping->a_ops->readpage) { + ret = -EINVAL; + break; + } + + /* First and last PARTIAL page! */ + start_index = offset >> PAGE_CACHE_SHIFT; + end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; + + /* Careful about overflow on the "+1" */ + nrpages = end_index - start_index + 1; + if (!nrpages) + nrpages = ~0UL; + + ret = force_page_cache_readahead(mapping, file, + start_index, + max_sane_readahead(nrpages)); + if (ret > 0) + ret = 0; + break; + case POSIX_FADV_DONTNEED: + if (!bdi_write_congested(mapping->backing_dev_info)) + filemap_flush(mapping); + + /* First and last FULL page! */ + start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + end_index = (endbyte >> PAGE_CACHE_SHIFT); + + if (end_index > start_index) + invalidate_mapping_pages(mapping, start_index, end_index-1); + break; + default: + ret = -EINVAL; + } +out: + fput(file); + return ret; +} + +#ifdef __ARCH_WANT_SYS_FADVISE64 + +asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) +{ + return sys_fadvise64_64(fd, offset, len, advice); +} + +#endif diff --git a/mm/filemap.c b/mm/filemap.c new file mode 100644 index 000000000000..439b2bea8e34 --- /dev/null +++ b/mm/filemap.c @@ -0,0 +1,2306 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* + * This is needed for the following functions: + * - try_to_release_page + * - block_invalidatepage + * - generic_osync_inode + * + * FIXME: remove all knowledge of the buffer layer from the core VM + */ +#include /* for generic_osync_inode */ + +#include +#include + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli + */ + +/* + * Lock ordering: + * + * ->i_mmap_lock (vmtruncate) + * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->swap_list_lock + * ->swap_device_lock (exclusive_swap_page, others) + * ->mapping->tree_lock + * + * ->i_sem + * ->i_mmap_lock (truncate->unmap_mapping_range) + * + * ->mmap_sem + * ->i_mmap_lock + * ->page_table_lock (various places, mainly in mmap.c) + * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * + * ->mmap_sem + * ->lock_page (access_process_vm) + * + * ->mmap_sem + * ->i_sem (msync) + * + * ->i_sem + * ->i_alloc_sem (various) + * + * ->inode_lock + * ->sb_lock (fs/fs-writeback.c) + * ->mapping->tree_lock (__sync_single_inode) + * + * ->i_mmap_lock + * ->anon_vma.lock (vma_adjust) + * + * ->anon_vma.lock + * ->page_table_lock (anon_vma_prepare and various) + * + * ->page_table_lock + * ->swap_device_lock (try_to_unmap_one) + * ->private_lock (try_to_unmap_one) + * ->tree_lock (try_to_unmap_one) + * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->private_lock (page_remove_rmap->set_page_dirty) + * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->inode_lock (page_remove_rmap->set_page_dirty) + * ->inode_lock (zap_pte_range->set_page_dirty) + * ->private_lock (zap_pte_range->__set_page_dirty_buffers) + * + * ->task->proc_lock + * ->dcache_lock (proc_pid_lookup) + */ + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold a write_lock on the mapping's tree_lock. + */ +void __remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; + mapping->nrpages--; + pagecache_acct(-1); +} + +void remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (unlikely(!PageLocked(page))) + PAGE_BUG(page); + + write_lock_irq(&mapping->tree_lock); + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); +} + +static int sync_page(void *word) +{ + struct address_space *mapping; + struct page *page; + + page = container_of((page_flags_t *)word, struct page, flags); + + /* + * FIXME, fercrissake. What is this barrier here for? + */ + smp_mb(); + mapping = page_mapping(page); + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + mapping->a_ops->sync_page(page); + io_schedule(); + return 0; +} + +/** + * filemap_fdatawrite_range - start writeback against all of a mapping's + * dirty pages that lie within the byte offsets + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end : offset in bytes where the range ends + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory * cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. + */ +static int __filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end, int sync_mode) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + .start = start, + .end = end, + }; + + if (!mapping_cap_writeback_dirty(mapping)) + return 0; + + ret = do_writepages(mapping, &wbc); + return ret; +} + +static inline int __filemap_fdatawrite(struct address_space *mapping, + int sync_mode) +{ + return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); +} + +int filemap_fdatawrite(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_ALL); +} +EXPORT_SYMBOL(filemap_fdatawrite); + +static int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); +} + +/* + * This is a mostly non-blocking flush. Not suitable for data-integrity + * purposes - I/O may not be started against all dirty pages. + */ +int filemap_flush(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_NONE); +} +EXPORT_SYMBOL(filemap_flush); + +/* + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +static int wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} + +/* + * Write and wait upon all the pages in the passed range. This is a "data + * integrity" operation. It waits upon in-flight writeout before starting and + * waiting upon new writeout. If there was an IO error, return it. + * + * We need to re-take i_sem during the generic_osync_inode list walk because + * it is otherwise livelockable. + */ +int sync_page_range(struct inode *inode, struct address_space *mapping, + loff_t pos, size_t count) +{ + pgoff_t start = pos >> PAGE_CACHE_SHIFT; + pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + int ret; + + if (!mapping_cap_writeback_dirty(mapping) || !count) + return 0; + ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); + if (ret == 0) { + down(&inode->i_sem); + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + up(&inode->i_sem); + } + if (ret == 0) + ret = wait_on_page_writeback_range(mapping, start, end); + return ret; +} +EXPORT_SYMBOL(sync_page_range); + +/* + * Note: Holding i_sem across sync_page_range_nolock is not a good idea + * as it forces O_SYNC writers to different parts of the same file + * to be serialised right until io completion. + */ +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, size_t count) +{ + pgoff_t start = pos >> PAGE_CACHE_SHIFT; + pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + int ret; + + if (!mapping_cap_writeback_dirty(mapping) || !count) + return 0; + ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); + if (ret == 0) + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + if (ret == 0) + ret = wait_on_page_writeback_range(mapping, start, end); + return ret; +} +EXPORT_SYMBOL(sync_page_range_nolock); + +/** + * filemap_fdatawait - walk the list of under-writeback pages of the given + * address space and wait for all of them. + * + * @mapping: address space structure to wait for + */ +int filemap_fdatawait(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return 0; + + return wait_on_page_writeback_range(mapping, 0, + (i_size - 1) >> PAGE_CACHE_SHIFT); +} +EXPORT_SYMBOL(filemap_fdatawait); + +int filemap_write_and_wait(struct address_space *mapping) +{ + int retval = 0; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + } + return retval; +} + +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int retval = 0; + + if (mapping->nrpages) { + retval = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + if (retval == 0) + retval = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + } + return retval; +} + +/* + * This function is used to add newly allocated pagecache pages: + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). + * + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t offset, int gfp_mask) +{ + int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + + if (error == 0) { + write_lock_irq(&mapping->tree_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + if (!error) { + page_cache_get(page); + SetPageLocked(page); + page->mapping = mapping; + page->index = offset; + mapping->nrpages++; + pagecache_acct(1); + } + write_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + } + return error; +} + +EXPORT_SYMBOL(add_to_page_cache); + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, int gfp_mask) +{ + int ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) + lru_cache_add(page); + return ret; +} + +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ +static wait_queue_head_t *page_waitqueue(struct page *page) +{ + const struct zone *zone = page_zone(page); + + return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; +} + +static inline void wake_up_page(struct page *page, int bit) +{ + __wake_up_bit(page_waitqueue(page), &page->flags, bit); +} + +void fastcall wait_on_page_bit(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (test_bit(bit_nr, &page->flags)) + __wait_on_bit(page_waitqueue(page), &wait, sync_page, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_on_page_bit); + +/** + * unlock_page() - unlock a locked page + * + * @page: the page + * + * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). + * Also wakes sleepers in wait_on_page_writeback() because the wakeup + * mechananism between PageLocked pages and PageWriteback pages is shared. + * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. + * + * The first mb is necessary to safely close the critical section opened by the + * TestSetPageLocked(), the second mb is necessary to enforce ordering between + * the clear_bit and the read of the waitqueue (to avoid SMP races with a + * parallel wait_on_page_locked()). + */ +void fastcall unlock_page(struct page *page) +{ + smp_mb__before_clear_bit(); + if (!TestClearPageLocked(page)) + BUG(); + smp_mb__after_clear_bit(); + wake_up_page(page, PG_locked); +} +EXPORT_SYMBOL(unlock_page); + +/* + * End writeback against a page. + */ +void end_page_writeback(struct page *page) +{ + if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { + if (!test_clear_page_writeback(page)) + BUG(); + } + smp_mb__after_clear_bit(); + wake_up_page(page, PG_writeback); +} +EXPORT_SYMBOL(end_page_writeback); + +/* + * Get a lock on the page, assuming we need to sleep to get it. + * + * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * random driver's requestfn sets TASK_RUNNING, we could busywait. However + * chances are that on the second loop, the block layer's plug list is empty, + * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. + */ +void fastcall __lock_page(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_page); + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * find_get_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page) + page_cache_get(page); + read_unlock_irq(&mapping->tree_lock); + return page; +} + +EXPORT_SYMBOL(find_get_page); + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page && TestSetPageLocked(page)) + page = NULL; + read_unlock_irq(&mapping->tree_lock); + return page; +} + +EXPORT_SYMBOL(find_trylock_page); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * + * @mapping - the address_space to search + * @offset - the page index + * + * Locates the desired pagecache page, locks it, increments its reference + * count and returns its address. + * + * Returns zero if the page was not present. find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, + unsigned long offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); +repeat: + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page) { + page_cache_get(page); + if (TestSetPageLocked(page)) { + read_unlock_irq(&mapping->tree_lock); + lock_page(page); + read_lock_irq(&mapping->tree_lock); + + /* Has the page been truncated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + } + } + read_unlock_irq(&mapping->tree_lock); + return page; +} + +EXPORT_SYMBOL(find_lock_page); + +/** + * find_or_create_page - locate or add a pagecache page + * + * @mapping - the page's address_space + * @index - the page's index into the mapping + * @gfp_mask - page allocation mode + * + * Locates a page in the pagecache. If the page is not present, a new page + * is allocated using @gfp_mask and is added to the pagecache and to the VM's + * LRU list. The returned page is locked and has its reference count + * incremented. + * + * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic + * allocation! + * + * find_or_create_page() returns the desired page's address, or zero on + * memory exhaustion. + */ +struct page *find_or_create_page(struct address_space *mapping, + unsigned long index, unsigned int gfp_mask) +{ + struct page *page, *cached_page = NULL; + int err; +repeat: + page = find_lock_page(mapping, index); + if (!page) { + if (!cached_page) { + cached_page = alloc_page(gfp_mask); + if (!cached_page) + return NULL; + } + err = add_to_page_cache_lru(cached_page, mapping, + index, gfp_mask); + if (!err) { + page = cached_page; + cached_page = NULL; + } else if (err == -EEXIST) + goto repeat; + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +EXPORT_SYMBOL(find_or_create_page); + +/** + * find_get_pages - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages() will search for and return a group of up to + * @nr_pages pages in the mapping. The pages are placed at @pages. + * find_get_pages() takes a reference against the returned pages. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * find_get_pages() returns the number of pages which were found. + */ +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + read_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, start, nr_pages); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + read_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Like find_get_pages, except we only return pages which are tagged with + * `tag'. We update *index to index the next page for the traversal. + */ +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + read_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup_tag(&mapping->page_tree, + (void **)pages, *index, nr_pages, tag); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + if (ret) + *index = pages[ret - 1]->index + 1; + read_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + * + * Clear __GFP_FS when allocating the page to avoid recursion into the fs + * and deadlock against the caller's locked page. + */ +struct page * +grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page = find_get_page(mapping, index); + unsigned int gfp_mask; + + if (page) { + if (!TestSetPageLocked(page)) + return page; + page_cache_release(page); + return NULL; + } + gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; + page = alloc_pages(gfp_mask, 0); + if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page_cache_release(page); + page = NULL; + } + return page; +} + +EXPORT_SYMBOL(grab_cache_page_nowait); + +/* + * This is a generic file read routine, and uses the + * mapping->a_ops->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + * + * Note the struct file* is only passed for the use of readpage. It may be + * NULL. + */ +void do_generic_mapping_read(struct address_space *mapping, + struct file_ra_state *_ra, + struct file *filp, + loff_t *ppos, + read_descriptor_t *desc, + read_actor_t actor) +{ + struct inode *inode = mapping->host; + unsigned long index; + unsigned long end_index; + unsigned long offset; + unsigned long last_index; + unsigned long next_index; + unsigned long prev_index; + loff_t isize; + struct page *cached_page; + int error; + struct file_ra_state ra = *_ra; + + cached_page = NULL; + index = *ppos >> PAGE_CACHE_SHIFT; + next_index = index; + prev_index = ra.prev_page; + last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + isize = i_size_read(inode); + if (!isize) + goto out; + + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + for (;;) { + struct page *page; + unsigned long nr, ret; + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index >= end_index) { + if (index > end_index) + goto out; + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + goto out; + } + } + nr = nr - offset; + + cond_resched(); + if (index == next_index) + next_index = page_cache_readahead(mapping, &ra, filp, + index, last_index - index); + +find_page: + page = find_get_page(mapping, index); + if (unlikely(page == NULL)) { + handle_ra_miss(mapping, &ra, index); + goto no_cached_page; + } + if (!PageUptodate(page)) + goto page_not_up_to_date; +page_ok: + + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + /* + * When (part of) the same page is read multiple times + * in succession, only mark it as accessed the first time. + */ + if (prev_index != index) + mark_page_accessed(page); + prev_index = index; + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + goto out; + +page_not_up_to_date: + /* Get exclusive access to the page ... */ + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + goto page_ok; + } + +readpage: + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) + goto readpage_error; + + if (!PageUptodate(page)) { + lock_page(page); + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_inode_pages got it + */ + unlock_page(page); + page_cache_release(page); + goto find_page; + } + unlock_page(page); + error = -EIO; + goto readpage_error; + } + unlock_page(page); + } + + /* + * i_size must be checked after we have done ->readpage. + * + * Checking i_size after the readpage allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + page_cache_release(page); + goto out; + } + } + nr = nr - offset; + goto page_ok; + +readpage_error: + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + goto out; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + if (!cached_page) { + cached_page = page_cache_alloc_cold(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + goto out; + } + } + error = add_to_page_cache_lru(cached_page, mapping, + index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; + desc->error = error; + goto out; + } + page = cached_page; + cached_page = NULL; + goto readpage; + } + +out: + *_ra = ra; + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + if (cached_page) + page_cache_release(cached_page); + if (filp) + file_accessed(filp); +} + +EXPORT_SYMBOL(do_generic_mapping_read); + +int file_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + /* + * Faults on the destination of a read are common, so do it before + * taking the kmap. + */ + if (!fault_in_pages_writeable(desc->arg.buf, size)) { + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_to_user_inatomic(desc->arg.buf, + kaddr + offset, size); + kunmap_atomic(kaddr, KM_USER0); + if (left == 0) + goto success; + } + + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_to_user(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } +success: + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t +__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + + count = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + count += iv->iov_len; + if (unlikely((ssize_t)(count|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + count -= iv->iov_len; /* This segment is no good */ + break; + } + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (filp->f_flags & O_DIRECT) { + loff_t pos = *ppos, size; + struct address_space *mapping; + struct inode *inode; + + mapping = filp->f_mapping; + inode = mapping->host; + retval = 0; + if (!count) + goto out; /* skip atime */ + size = i_size_read(inode); + if (pos < size) { + retval = generic_file_direct_IO(READ, iocb, + iov, pos, nr_segs); + if (retval >= 0 && !is_sync_kiocb(iocb)) + retval = -EIOCBQUEUED; + if (retval > 0) + *ppos = pos + retval; + } + file_accessed(filp); + goto out; + } + + retval = 0; + if (count) { + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp,ppos,&desc,file_read_actor); + retval += desc.written; + if (!retval) { + retval = desc.error; + break; + } + } + } +out: + return retval; +} + +EXPORT_SYMBOL(__generic_file_aio_read); + +ssize_t +generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +{ + struct iovec local_iov = { .iov_base = buf, .iov_len = count }; + + BUG_ON(iocb->ki_pos != pos); + return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); +} + +EXPORT_SYMBOL(generic_file_aio_read); + +ssize_t +generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + struct iovec local_iov = { .iov_base = buf, .iov_len = count }; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +EXPORT_SYMBOL(generic_file_read); + +int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = desc->arg.data; + + if (size > count) + size = count; + + written = file->f_op->sendpage(file, page, offset, + size, &file->f_pos, sizeerror = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, + size_t count, read_actor_t actor, void *target) +{ + read_descriptor_t desc; + + if (!count) + return 0; + + desc.written = 0; + desc.count = count; + desc.arg.data = target; + desc.error = 0; + + do_generic_file_read(in_file, ppos, &desc, actor); + if (desc.written) + return desc.written; + return desc.error; +} + +EXPORT_SYMBOL(generic_file_sendfile); + +static ssize_t +do_readahead(struct address_space *mapping, struct file *filp, + unsigned long index, unsigned long nr) +{ + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + force_page_cache_readahead(mapping, filp, index, + max_sane_readahead(nr)); + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + struct address_space *mapping = file->f_mapping; + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; + ret = do_readahead(mapping, file, start, len); + } + fput(file); + } + return ret; +} + +#ifdef CONFIG_MMU +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int fastcall page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_mapping; + struct page *page; + int error; + + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (!error) { + error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first + * or we are out of memory for radix-tree nodes. + */ + page_cache_release(page); + return error == -EEXIST ? 0 : error; +} + +#define MMAP_LOTSAMISS (100) + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page *filemap_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_mapping; + struct file_ra_state *ra = &file->f_ra; + struct inode *inode = mapping->host; + struct page *page; + unsigned long size, pgoff; + int did_readaround = 0, majmin = VM_FAULT_MINOR; + + pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (pgoff >= size) + goto outside_data_content; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(area)) + goto no_cached_page; + + /* + * The readahead code wants to be told about each and every page + * so it can build and shrink its windows appropriately + * + * For sequential accesses, we use the generic readahead logic. + */ + if (VM_SequentialReadHint(area)) + page_cache_readahead(mapping, ra, file, pgoff, 1); + + /* + * Do we have something in the page cache already? + */ +retry_find: + page = find_get_page(mapping, pgoff); + if (!page) { + unsigned long ra_pages; + + if (VM_SequentialReadHint(area)) { + handle_ra_miss(mapping, ra, pgoff); + goto no_cached_page; + } + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) + goto no_cached_page; + + /* + * To keep the pgmajfault counter straight, we need to + * check did_readaround, as this is an inner loop. + */ + if (!did_readaround) { + majmin = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + } + did_readaround = 1; + ra_pages = max_sane_readahead(file->f_ra.ra_pages); + if (ra_pages) { + pgoff_t start = 0; + + if (pgoff > ra_pages / 2) + start = pgoff - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); + } + page = find_get_page(mapping, pgoff); + if (!page) + goto no_cached_page; + } + + if (!did_readaround) + ra->mmap_hit++; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!PageUptodate(page)) + goto page_not_uptodate; + +success: + /* + * Found the page and have a reference on it. + */ + mark_page_accessed(page); + if (type) + *type = majmin; + return page; + +outside_data_content: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + if (area->vm_mm == current->mm) + return NULL; + /* Fall through to the non-read-ahead case */ +no_cached_page: + /* + * We're only likely to ever get here if MADV_RANDOM is in + * effect. + */ + error = page_cache_read(file, pgoff); + grab_swap_token(); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + if (!did_readaround) { + majmin = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + } + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +EXPORT_SYMBOL(filemap_nopage); + +static struct page * filemap_getpage(struct file *file, unsigned long pgoff, + int nonblock) +{ + struct address_space *mapping = file->f_mapping; + struct page *page; + int error; + + /* + * Do we have something in the page cache already? + */ +retry_find: + page = find_get_page(mapping, pgoff); + if (!page) { + if (nonblock) + return NULL; + goto no_cached_page; + } + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!PageUptodate(page)) + goto page_not_uptodate; + +success: + /* + * Found the page and have a reference on it. + */ + mark_page_accessed(page); + return page; + +no_cached_page: + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + unlock_page(page); + goto err; + } + + /* Did somebody else get it up-to-date? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + unlock_page(page); + goto err; + } + /* Somebody else successfully read it in? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ +err: + page_cache_release(page); + + return NULL; +} + +int filemap_populate(struct vm_area_struct *vma, unsigned long addr, + unsigned long len, pgprot_t prot, unsigned long pgoff, + int nonblock) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long size; + struct mm_struct *mm = vma->vm_mm; + struct page *page; + int err; + + if (!nonblock) + force_page_cache_readahead(mapping, vma->vm_file, + pgoff, len >> PAGE_CACHE_SHIFT); + +repeat: + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) + return -EINVAL; + + page = filemap_getpage(file, pgoff, nonblock); + if (!page && !nonblock) + return -ENOMEM; + if (page) { + err = install_page(mm, vma, addr, page, prot); + if (err) { + page_cache_release(page); + return err; + } + } else { + err = install_file_pte(mm, vma, addr, pgoff, prot); + if (err) + return err; + } + + len -= PAGE_SIZE; + addr += PAGE_SIZE; + pgoff++; + if (len) + goto repeat; + + return 0; +} + +struct vm_operations_struct generic_file_vm_ops = { + .nopage = filemap_nopage, + .populate = filemap_populate, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} +EXPORT_SYMBOL(filemap_populate); + +/* + * This is for filesystems which do not implement ->writepage. + */ +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + return generic_file_mmap(file, vma); +} +#else +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +#endif /* CONFIG_MMU */ + +EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap); + +static inline struct page *__read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page, *cached_page = NULL; + int err; +repeat: + page = find_get_page(mapping, index); + if (!page) { + if (!cached_page) { + cached_page = page_cache_alloc_cold(mapping); + if (!cached_page) + return ERR_PTR(-ENOMEM); + } + err = add_to_page_cache_lru(cached_page, mapping, + index, GFP_KERNEL); + if (err == -EEXIST) + goto repeat; + if (err < 0) { + /* Presumably ENOMEM for radix tree node */ + page_cache_release(cached_page); + return ERR_PTR(err); + } + page = cached_page; + cached_page = NULL; + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +/* + * Read into the page cache. If a page already exists, + * and PageUptodate() is not set, try to fill the page. + */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry; + } + if (PageUptodate(page)) { + unlock_page(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + out: + return page; +} + +EXPORT_SYMBOL(read_cache_page); + +/* + * If the page was newly created, increment its refcount and add it to the + * caller's lru-buffering pagevec. This function is specifically for + * generic_file_write(). + */ +static inline struct page * +__grab_cache_page(struct address_space *mapping, unsigned long index, + struct page **cached_page, struct pagevec *lru_pvec) +{ + int err; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (!page) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (!*cached_page) + return NULL; + } + err = add_to_page_cache(*cached_page, mapping, + index, GFP_KERNEL); + if (err == -EEXIST) + goto repeat; + if (err == 0) { + page = *cached_page; + page_cache_get(page); + if (!pagevec_add(lru_pvec, page)) + __pagevec_lru_add(lru_pvec); + *cached_page = NULL; + } + } + return page; +} + +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int remove_suid(struct dentry *dentry) +{ + mode_t mode = dentry->d_inode->i_mode; + int kill = 0; + int result = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID))) { + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + result = notify_change(dentry, &newattrs); + } + return result; +} +EXPORT_SYMBOL(remove_suid); + +/* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then clear the page + * out to (offset+bytes) and return the number of bytes which were copied. + */ +static inline size_t +filemap_copy_from_user(struct page *page, unsigned long offset, + const char __user *buf, unsigned bytes) +{ + char *kaddr; + int left; + + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + kunmap_atomic(kaddr, KM_USER0); + + if (left != 0) { + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + } + return bytes - left; +} + +static size_t +__filemap_copy_from_user_iovec(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) { + /* zero the rest of the target like __copy_from_user */ + if (bytes) + memset(vaddr, 0, bytes); + break; + } + } + return copied - left; +} + +/* + * This has the same sideeffects and return value as filemap_copy_from_user(). + * The difference is that on a fault we need to memset the remainder of the + * page (out to offset+bytes), to emulate filemap_copy_from_user()'s + * single-segment behaviour. + */ +static inline size_t +filemap_copy_from_user_iovec(struct page *page, unsigned long offset, + const struct iovec *iov, size_t base, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page, KM_USER0); + copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, + base, bytes); + kunmap_atomic(kaddr, KM_USER0); + if (copied != bytes) { + kaddr = kmap(page); + copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, + base, bytes); + kunmap(page); + } + return copied; +} + +static inline void +filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) +{ + const struct iovec *iov = *iovp; + size_t base = *basep; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + *iovp = iov; + *basep = base; +} + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position aor amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) +{ + struct inode *inode = file->f_mapping->host; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + + if (unlikely(*pos < 0)) + return -EINVAL; + + if (unlikely(file->f_error)) { + int err = file->f_error; + file->f_error = 0; + return err; + } + + if (!isblk) { + /* FIXME: this is for backwards compatibility with 2.4 */ + if (file->f_flags & O_APPEND) + *pos = i_size_read(inode); + + if (limit != RLIM_INFINITY) { + if (*pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (*count > limit - (typeof(limit))*pos) { + *count = limit - (typeof(limit))*pos; + } + } + } + + /* + * LFS rule + */ + if (unlikely(*pos + *count > MAX_NON_LFS && + !(file->f_flags & O_LARGEFILE))) { + if (*pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (*count > MAX_NON_LFS - (unsigned long)*pos) { + *count = MAX_NON_LFS - (unsigned long)*pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write. If we have + * exceeded without writing data we send a signal and return EFBIG. + * Linus frestrict idea will clean these up nicely.. + */ + if (likely(!isblk)) { + if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { + if (*count || *pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) + *count = inode->i_sb->s_maxbytes - *pos; + } else { + loff_t isize; + if (bdev_read_only(I_BDEV(inode))) + return -EPERM; + isize = i_size_read(inode); + if (*pos >= isize) { + if (*count || *pos > isize) + return -ENOSPC; + } + + if (*pos + *count > isize) + *count = isize - *pos; + } + return 0; +} +EXPORT_SYMBOL(generic_write_checks); + +ssize_t +generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long *nr_segs, loff_t pos, loff_t *ppos, + size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t written; + + if (count != ocount) + *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + + written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); + if (written > 0) { + loff_t end = pos + written; + if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, end); + mark_inode_dirty(inode); + } + *ppos = end; + } + + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + * i_sem is held, which protects generic_osync_inode() from + * livelocking. + */ + if (written >= 0 && file->f_flags & O_SYNC) + generic_osync_inode(inode, mapping, OSYNC_METADATA); + if (written == count && !is_sync_kiocb(iocb)) + written = -EIOCBQUEUED; + return written; +} +EXPORT_SYMBOL(generic_file_direct_write); + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + struct page *page; + struct page *cached_page = NULL; + size_t bytes; + struct pagevec lru_pvec; + const struct iovec *cur_iov = iov; /* current iovec */ + size_t iov_base = 0; /* offset in the current iovec */ + char __user *buf; + + pagevec_init(&lru_pvec, 0); + + /* + * handle partial DIO write. Adjust cur_iov if needed. + */ + if (likely(nr_segs == 1)) + buf = iov->iov_base + written; + else { + filemap_set_next_iovec(&cur_iov, &iov_base, written); + buf = iov->iov_base + iov_base; + } + + do { + unsigned long index; + unsigned long offset; + size_t copied; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + */ + fault_in_pages_readable(buf, bytes); + + page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + if (!page) { + status = -ENOMEM; + break; + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) { + loff_t isize = i_size_read(inode); + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + unlock_page(page); + page_cache_release(page); + if (pos + bytes > isize) + vmtruncate(inode, isize); + break; + } + if (likely(nr_segs == 1)) + copied = filemap_copy_from_user(page, offset, + buf, bytes); + else + copied = filemap_copy_from_user_iovec(page, offset, + cur_iov, iov_base, bytes); + flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); + if (likely(copied > 0)) { + if (!status) + status = copied; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + if (unlikely(nr_segs > 1)) + filemap_set_next_iovec(&cur_iov, + &iov_base, status); + } + } + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + if (status < 0) + break; + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* + * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + */ + if (likely(status >= 0)) { + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (!a_ops->writepage || !is_sync_kiocb(iocb)) + status = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + } + } + + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + + pagevec_lru_add(&lru_pvec); + return written ? written : status; +} +EXPORT_SYMBOL(generic_file_buffered_write); + +ssize_t +__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + struct inode *inode = mapping->host; + unsigned long seg; + loff_t pos; + ssize_t written; + ssize_t err; + + ocount = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + count = ocount; + pos = *ppos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = remove_suid(file->f_dentry); + if (err) + goto out; + + inode_update_time(inode, 1); + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (unlikely(file->f_flags & O_DIRECT)) { + written = generic_file_direct_write(iocb, iov, + &nr_segs, pos, ppos, count, ocount); + if (written < 0 || written == count) + goto out; + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + pos += written; + count -= written; + } + + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); +out: + current->backing_dev_info = NULL; + return written ? written : err; +} +EXPORT_SYMBOL(generic_file_aio_write_nolock); + +ssize_t +generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + loff_t pos = *ppos; + + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + err = sync_page_range_nolock(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} + +ssize_t +__generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +ssize_t +generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} +EXPORT_SYMBOL(generic_file_write_nolock); + +ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + + BUG_ON(iocb->ki_pos != pos); + + down(&inode->i_sem); + ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + up(&inode->i_sem); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_aio_write); + +ssize_t generic_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + + down(&inode->i_sem); + ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); + up(&inode->i_sem); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_write); + +ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} +EXPORT_SYMBOL(generic_file_readv); + +ssize_t generic_file_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + down(&inode->i_sem); + ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); + up(&inode->i_sem); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_writev); + +/* + * Called under i_sem for writes to S_ISREG files. Returns -EIO if something + * went wrong during pagecache shootdown. + */ +ssize_t +generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + ssize_t retval; + size_t write_len = 0; + + /* + * If it's a write, unmap all mmappings of the file up-front. This + * will cause any pte dirty bits to be propagated into the pageframes + * for the subsequent filemap_write_and_wait(). + */ + if (rw == WRITE) { + write_len = iov_length(iov, nr_segs); + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, offset, write_len, 0); + } + + retval = filemap_write_and_wait(mapping); + if (retval == 0) { + retval = mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); + if (rw == WRITE && mapping->nrpages) { + pgoff_t end = (offset + write_len - 1) + >> PAGE_CACHE_SHIFT; + int err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, end); + if (err) + retval = err; + } + } + return retval; +} +EXPORT_SYMBOL_GPL(generic_file_direct_IO); diff --git a/mm/fremap.c b/mm/fremap.c new file mode 100644 index 000000000000..3235fb77c133 --- /dev/null +++ b/mm/fremap.c @@ -0,0 +1,256 @@ +/* + * linux/mm/fremap.c + * + * Explicit pagetable population and nonlinear (random) mappings support. + * + * started by Ingo Molnar, Copyright (C) 2002, 2003 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + + if (pte_none(pte)) + return; + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + + flush_cache_page(vma, addr, pfn); + pte = ptep_clear_flush(vma, addr, ptep); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); + dec_mm_counter(mm, rss); + } + } + } else { + if (!pte_file(pte)) + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(mm, addr, ptep); + } +} + +/* + * Install a file page to a given virtual memory address, release any + * previously existing mapping. + */ +int install_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, struct page *page, pgprot_t prot) +{ + struct inode *inode; + pgoff_t size; + int err = -ENOMEM; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + pte_t pte_val; + + pgd = pgd_offset(mm, addr); + spin_lock(&mm->page_table_lock); + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + goto err_unlock; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + goto err_unlock; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + goto err_unlock; + + /* + * This page may have been truncated. Tell the + * caller about it. + */ + err = -EINVAL; + inode = vma->vm_file->f_mapping->host; + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (!page->mapping || page->index >= size) + goto err_unlock; + + zap_pte(mm, vma, addr, pte); + + inc_mm_counter(mm,rss); + flush_icache_page(vma, page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + page_add_file_rmap(page); + pte_val = *pte; + pte_unmap(pte); + update_mmu_cache(vma, addr, pte_val); + + err = 0; +err_unlock: + spin_unlock(&mm->page_table_lock); + return err; +} +EXPORT_SYMBOL(install_page); + + +/* + * Install a file pte to a given virtual memory address, release any + * previously existing mapping. + */ +int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long pgoff, pgprot_t prot) +{ + int err = -ENOMEM; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + pte_t pte_val; + + pgd = pgd_offset(mm, addr); + spin_lock(&mm->page_table_lock); + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + goto err_unlock; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + goto err_unlock; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + goto err_unlock; + + zap_pte(mm, vma, addr, pte); + + set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + pte_val = *pte; + pte_unmap(pte); + update_mmu_cache(vma, addr, pte_val); + spin_unlock(&mm->page_table_lock); + return 0; + +err_unlock: + spin_unlock(&mm->page_table_lock); + return err; +} + + +/*** + * sys_remap_file_pages - remap arbitrary pages of a shared backing store + * file within an existing vma. + * @start: start of the remapped virtual memory range + * @size: size of the remapped virtual memory range + * @prot: new protection bits of the range + * @pgoff: to be mapped page of the backing store file + * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. + * + * this syscall works purely via pagetables, so it's the most efficient + * way to map the same (large) file into a given virtual window. Unlike + * mmap()/mremap() it does not create any new vmas. The new mappings are + * also safe across swapout. + * + * NOTE: the 'prot' parameter right now is ignored, and the vma's default + * protection is used. Arbitrary protections might be implemented in the + * future. + */ +asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long __prot, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct address_space *mapping; + unsigned long end = start + size; + struct vm_area_struct *vma; + int err = -EINVAL; + int has_write_lock = 0; + + if (__prot) + return err; + /* + * Sanitize the syscall parameters: + */ + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + /* Does the address range wrap, or is the span zero-sized? */ + if (start + size <= start) + return err; + + /* Can we represent this offset inside this architecture's pte's? */ +#if PTE_FILE_MAX_BITS < BITS_PER_LONG + if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) + return err; +#endif + + /* We need down_write() to change vma->vm_flags. */ + down_read(&mm->mmap_sem); + retry: + vma = find_vma(mm, start); + + /* + * Make sure the vma is shared, that it supports prefaulting, + * and that the remapped range is valid and fully within + * the single existing vma. vm_private_data is used as a + * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED + * or VM_LOCKED, but VM_LOCKED could be revoked later on). + */ + if (vma && (vma->vm_flags & VM_SHARED) && + (!vma->vm_private_data || + (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) && + vma->vm_ops && vma->vm_ops->populate && + end > start && start >= vma->vm_start && + end <= vma->vm_end) { + + /* Must set VM_NONLINEAR before any pages are populated. */ + if (pgoff != linear_page_index(vma, start) && + !(vma->vm_flags & VM_NONLINEAR)) { + if (!has_write_lock) { + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + has_write_lock = 1; + goto retry; + } + mapping = vma->vm_file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + flush_dcache_mmap_lock(mapping); + vma->vm_flags |= VM_NONLINEAR; + vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); + } + + err = vma->vm_ops->populate(vma, start, size, + vma->vm_page_prot, + pgoff, flags & MAP_NONBLOCK); + + /* + * We can't clear VM_NONLINEAR because we'd have to do + * it after ->populate completes, and that would prevent + * downgrading the lock. (Locks can't be upgraded). + */ + } + if (likely(!has_write_lock)) + up_read(&mm->mmap_sem); + else + up_write(&mm->mmap_sem); + + return err; +} + diff --git a/mm/highmem.c b/mm/highmem.c new file mode 100644 index 000000000000..d01276506b00 --- /dev/null +++ b/mm/highmem.c @@ -0,0 +1,607 @@ +/* + * High memory handling common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * 64-bit physical space. With current x86 CPUs this + * means up to 64 Gigabytes physical RAM. + * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static mempool_t *page_pool, *isa_page_pool; + +static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data) +{ + unsigned int gfp = gfp_mask | (unsigned int) (long) data; + + return alloc_page(gfp); +} + +static void page_pool_free(void *page, void *data) +{ + __free_page(page); +} + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. + * n means that there are (n-1) current users of it. + */ +#ifdef CONFIG_HIGHMEM +static int pkmap_count[LAST_PKMAP]; +static unsigned int last_pkmap_nr; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); + +pte_t * pkmap_page_table; + +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + +static void flush_all_zero_pkmaps(void) +{ + int i; + + flush_cache_kmaps(); + + for (i = 0; i < LAST_PKMAP; i++) { + struct page *page; + + /* + * zero means we don't have anything to do, + * >1 means that it is still in use. Only + * a count of 1 means that it is free but + * needs to be unmapped + */ + if (pkmap_count[i] != 1) + continue; + pkmap_count[i] = 0; + + /* sanity check */ + if (pte_none(pkmap_page_table[i])) + BUG(); + + /* + * Don't need an atomic fetch-and-clear op here; + * no-one has the page mapped, and cannot get at + * its virtual address (and hence PTE) without first + * getting the kmap_lock (which is held here). + * So no dangers, even with speculative execution. + */ + page = pte_page(pkmap_page_table[i]); + pte_clear(&init_mm, (unsigned long)page_address(page), + &pkmap_page_table[i]); + + set_page_address(page, NULL); + } + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); +} + +static inline unsigned long map_new_virtual(struct page *page) +{ + unsigned long vaddr; + int count; + +start: + count = LAST_PKMAP; + /* Find an empty entry */ + for (;;) { + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + if (!last_pkmap_nr) { + flush_all_zero_pkmaps(); + count = LAST_PKMAP; + } + if (!pkmap_count[last_pkmap_nr]) + break; /* Found a usable entry */ + if (--count) + continue; + + /* + * Sleep for somebody else to unmap their entries + */ + { + DECLARE_WAITQUEUE(wait, current); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&pkmap_map_wait, &wait); + spin_unlock(&kmap_lock); + schedule(); + remove_wait_queue(&pkmap_map_wait, &wait); + spin_lock(&kmap_lock); + + /* Somebody else might have mapped it while we slept */ + if (page_address(page)) + return (unsigned long)page_address(page); + + /* Re-start */ + goto start; + } + } + vaddr = PKMAP_ADDR(last_pkmap_nr); + set_pte_at(&init_mm, vaddr, + &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + + pkmap_count[last_pkmap_nr] = 1; + set_page_address(page, (void *)vaddr); + + return vaddr; +} + +void fastcall *kmap_high(struct page *page) +{ + unsigned long vaddr; + + /* + * For highmem pages, we can't trust "virtual" until + * after we have the lock. + * + * We cannot call this from interrupts, as it may block + */ + spin_lock(&kmap_lock); + vaddr = (unsigned long)page_address(page); + if (!vaddr) + vaddr = map_new_virtual(page); + pkmap_count[PKMAP_NR(vaddr)]++; + if (pkmap_count[PKMAP_NR(vaddr)] < 2) + BUG(); + spin_unlock(&kmap_lock); + return (void*) vaddr; +} + +EXPORT_SYMBOL(kmap_high); + +void fastcall kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + int need_wakeup; + + spin_lock(&kmap_lock); + vaddr = (unsigned long)page_address(page); + if (!vaddr) + BUG(); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! + */ + need_wakeup = 0; + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + /* + * Avoid an unnecessary wake_up() function call. + * The common case is pkmap_count[] == 1, but + * no waiters. + * The tasks queued in the wait-queue are guarded + * by both the lock in the wait-queue-head and by + * the kmap_lock. As the kmap_lock is held here, + * no need for the wait-queue-head's lock. Simply + * test if the queue is empty. + */ + need_wakeup = waitqueue_active(&pkmap_map_wait); + } + spin_unlock(&kmap_lock); + + /* do wake-up, if needed, race-free outside of the spin lock */ + if (need_wakeup) + wake_up(&pkmap_map_wait); +} + +EXPORT_SYMBOL(kunmap_high); + +#define POOL_SIZE 64 + +static __init int init_emergency_pool(void) +{ + struct sysinfo i; + si_meminfo(&i); + si_swapinfo(&i); + + if (!i.totalhigh) + return 0; + + page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); + if (!page_pool) + BUG(); + printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + + return 0; +} + +__initcall(init_emergency_pool); + +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned long flags; + unsigned char *vto; + + local_irq_save(flags); + vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto, KM_BOUNCE_READ); + local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif + +#define ISA_POOL_SIZE 16 + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. + */ +int init_emergency_isa_pool(void) +{ + if (isa_page_pool) + return 0; + + isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); + if (!isa_page_pool) + BUG(); + + printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec *tovec, *fromvec; + int i; + + __bio_for_each_segment(tovec, to, i, 0) { + fromvec = from->bi_io_vec + i; + + /* + * not bounced + */ + if (tovec->bv_page == fromvec->bv_page) + continue; + + /* + * fromvec->bv_offset and fromvec->bv_len might have been + * modified by the block layer, so use the original copy, + * bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + + flush_dcache_page(tovec->bv_page); + bounce_copy_vec(tovec, vfrom); + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, *org_vec; + int i; + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + + /* + * free up bounce indirect pages used + */ + __bio_for_each_segment(bvec, bio, i, 0) { + org_vec = bio_orig->bi_io_vec + i; + if (bvec->bv_page == org_vec->bv_page) + continue; + + mempool_free(bvec->bv_page, pool); + } + + bio_endio(bio_orig, bio_orig->bi_size, err); + bio_put(bio); +} + +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) +{ + if (bio->bi_size) + return 1; + + bounce_end_io(bio, page_pool, err); + return 0; +} + +static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + bounce_end_io(bio, isa_page_pool, err); + return 0; +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool, err); +} + +static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + __bounce_end_io_read(bio, page_pool, err); + return 0; +} + +static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + __bounce_end_io_read(bio, isa_page_pool, err); + return 0; +} + +static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, + mempool_t *pool) +{ + struct page *page; + struct bio *bio = NULL; + int i, rw = bio_data_dir(*bio_orig); + struct bio_vec *to, *from; + + bio_for_each_segment(from, *bio_orig, i) { + page = from->bv_page; + + /* + * is destination page below bounce pfn? + */ + if (page_to_pfn(page) < q->bounce_pfn) + continue; + + /* + * irk, bounce it + */ + if (!bio) + bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); + + to = bio->bi_io_vec + i; + + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(from->bv_page); + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap(from->bv_page) + from->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap(from->bv_page); + } + } + + /* + * no pages bounced + */ + if (!bio) + return; + + /* + * at least one page was bounced, fill in possible non-highmem + * pages + */ + __bio_for_each_segment(from, *bio_orig, i, 0) { + to = bio_iovec_idx(bio, i); + if (!to->bv_page) { + to->bv_page = from->bv_page; + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + } + } + + bio->bi_bdev = (*bio_orig)->bi_bdev; + bio->bi_flags |= (1 << BIO_BOUNCED); + bio->bi_sector = (*bio_orig)->bi_sector; + bio->bi_rw = (*bio_orig)->bi_rw; + + bio->bi_vcnt = (*bio_orig)->bi_vcnt; + bio->bi_idx = (*bio_orig)->bi_idx; + bio->bi_size = (*bio_orig)->bi_size; + + if (pool == page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) +{ + mempool_t *pool; + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (q->bounce_pfn >= blk_max_pfn) + return; + pool = page_pool; + } else { + BUG_ON(!isa_page_pool); + pool = isa_page_pool; + } + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool); +} + +EXPORT_SYMBOL(blk_queue_bounce); + +#if defined(HASHED_PAGE_VIRTUAL) + +#define PA_HASH_ORDER 7 + +/* + * Describes one page->virtual association + */ +struct page_address_map { + struct page *page; + void *virtual; + struct list_head list; +}; + +/* + * page_address_map freelist, allocated from page_address_maps. + */ +static struct list_head page_address_pool; /* freelist */ +static spinlock_t pool_lock; /* protects page_address_pool */ + +/* + * Hash table bucket + */ +static struct page_address_slot { + struct list_head lh; /* List of page_address_maps */ + spinlock_t lock; /* Protect this bucket's list */ +} ____cacheline_aligned_in_smp page_address_htable[1<lock, flags); + if (!list_empty(&pas->lh)) { + struct page_address_map *pam; + + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + ret = pam->virtual; + goto done; + } + } + } +done: + spin_unlock_irqrestore(&pas->lock, flags); + return ret; +} + +EXPORT_SYMBOL(page_address); + +void set_page_address(struct page *page, void *virtual) +{ + unsigned long flags; + struct page_address_slot *pas; + struct page_address_map *pam; + + BUG_ON(!PageHighMem(page)); + + pas = page_slot(page); + if (virtual) { /* Add */ + BUG_ON(list_empty(&page_address_pool)); + + spin_lock_irqsave(&pool_lock, flags); + pam = list_entry(page_address_pool.next, + struct page_address_map, list); + list_del(&pam->list); + spin_unlock_irqrestore(&pool_lock, flags); + + pam->page = page; + pam->virtual = virtual; + + spin_lock_irqsave(&pas->lock, flags); + list_add_tail(&pam->list, &pas->lh); + spin_unlock_irqrestore(&pas->lock, flags); + } else { /* Remove */ + spin_lock_irqsave(&pas->lock, flags); + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + list_del(&pam->list); + spin_unlock_irqrestore(&pas->lock, flags); + spin_lock_irqsave(&pool_lock, flags); + list_add_tail(&pam->list, &page_address_pool); + spin_unlock_irqrestore(&pool_lock, flags); + goto done; + } + } + spin_unlock_irqrestore(&pas->lock, flags); + } +done: + return; +} + +static struct page_address_map page_address_maps[LAST_PKMAP]; + +void __init page_address_init(void) +{ + int i; + + INIT_LIST_HEAD(&page_address_pool); + for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) + list_add(&page_address_maps[i].list, &page_address_pool); + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { + INIT_LIST_HEAD(&page_address_htable[i].lh); + spin_lock_init(&page_address_htable[i].lock); + } + spin_lock_init(&pool_lock); +} + +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c new file mode 100644 index 000000000000..4eb5ae3fbe10 --- /dev/null +++ b/mm/hugetlb.c @@ -0,0 +1,260 @@ +/* + * Generic hugetlb support. + * (C) William Irwin, April 2004 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; +static unsigned long nr_huge_pages, free_huge_pages; +unsigned long max_huge_pages; +static struct list_head hugepage_freelists[MAX_NUMNODES]; +static unsigned int nr_huge_pages_node[MAX_NUMNODES]; +static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static DEFINE_SPINLOCK(hugetlb_lock); + +static void enqueue_huge_page(struct page *page) +{ + int nid = page_to_nid(page); + list_add(&page->lru, &hugepage_freelists[nid]); + free_huge_pages++; + free_huge_pages_node[nid]++; +} + +static struct page *dequeue_huge_page(void) +{ + int nid = numa_node_id(); + struct page *page = NULL; + + if (list_empty(&hugepage_freelists[nid])) { + for (nid = 0; nid < MAX_NUMNODES; ++nid) + if (!list_empty(&hugepage_freelists[nid])) + break; + } + if (nid >= 0 && nid < MAX_NUMNODES && + !list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + free_huge_pages--; + free_huge_pages_node[nid]--; + } + return page; +} + +static struct page *alloc_fresh_huge_page(void) +{ + static int nid = 0; + struct page *page; + page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, + HUGETLB_PAGE_ORDER); + nid = (nid + 1) % num_online_nodes(); + if (page) { + nr_huge_pages++; + nr_huge_pages_node[page_to_nid(page)]++; + } + return page; +} + +void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + page[1].mapping = NULL; + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + +struct page *alloc_huge_page(void) +{ + struct page *page; + int i; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page(); + if (!page) { + spin_unlock(&hugetlb_lock); + return NULL; + } + spin_unlock(&hugetlb_lock); + set_page_count(page, 1); + page[1].mapping = (void *)free_huge_page; + for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) + clear_highpage(&page[i]); + return page; +} + +static int __init hugetlb_init(void) +{ + unsigned long i; + struct page *page; + + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&hugepage_freelists[i]); + + for (i = 0; i < max_huge_pages; ++i) { + page = alloc_fresh_huge_page(); + if (!page) + break; + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); + } + max_huge_pages = free_huge_pages = nr_huge_pages = i; + printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); + return 0; +} +module_init(hugetlb_init); + +static int __init hugetlb_setup(char *s) +{ + if (sscanf(s, "%lu", &max_huge_pages) <= 0) + max_huge_pages = 0; + return 1; +} +__setup("hugepages=", hugetlb_setup); + +#ifdef CONFIG_SYSCTL +static void update_and_free_page(struct page *page) +{ + int i; + nr_huge_pages--; + nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; + for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | + 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1<< PG_writeback); + set_page_count(&page[i], 0); + } + set_page_count(page, 1); + __free_pages(page, HUGETLB_PAGE_ORDER); +} + +#ifdef CONFIG_HIGHMEM +static void try_to_free_low(unsigned long count) +{ + int i, nid; + for (i = 0; i < MAX_NUMNODES; ++i) { + struct page *page, *next; + list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { + if (PageHighMem(page)) + continue; + list_del(&page->lru); + update_and_free_page(page); + nid = page_zone(page)->zone_pgdat->node_id; + free_huge_pages--; + free_huge_pages_node[nid]--; + if (count >= nr_huge_pages) + return; + } + } +} +#else +static inline void try_to_free_low(unsigned long count) +{ +} +#endif + +static unsigned long set_max_huge_pages(unsigned long count) +{ + while (count > nr_huge_pages) { + struct page *page = alloc_fresh_huge_page(); + if (!page) + return nr_huge_pages; + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); + } + if (count >= nr_huge_pages) + return nr_huge_pages; + + spin_lock(&hugetlb_lock); + try_to_free_low(count); + while (count < nr_huge_pages) { + struct page *page = dequeue_huge_page(); + if (!page) + break; + update_and_free_page(page); + } + spin_unlock(&hugetlb_lock); + return nr_huge_pages; +} + +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + max_huge_pages = set_max_huge_pages(max_huge_pages); + return 0; +} +#endif /* CONFIG_SYSCTL */ + +int hugetlb_report_meminfo(char *buf) +{ + return sprintf(buf, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "Hugepagesize: %5lu kB\n", + nr_huge_pages, + free_huge_pages, + HPAGE_SIZE/1024); +} + +int hugetlb_report_node_meminfo(int nid, char *buf) +{ + return sprintf(buf, + "Node %d HugePages_Total: %5u\n" + "Node %d HugePages_Free: %5u\n", + nid, nr_huge_pages_node[nid], + nid, free_huge_pages_node[nid]); +} + +int is_hugepage_mem_enough(size_t size) +{ + return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; +} + +/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ +unsigned long hugetlb_total_pages(void) +{ + return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); +} +EXPORT_SYMBOL(hugetlb_total_pages); + +/* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the + * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get + * this far. + */ +static struct page *hugetlb_nopage(struct vm_area_struct *vma, + unsigned long address, int *unused) +{ + BUG(); + return NULL; +} + +struct vm_operations_struct hugetlb_vm_ops = { + .nopage = hugetlb_nopage, +}; + +void zap_hugepage_range(struct vm_area_struct *vma, + unsigned long start, unsigned long length) +{ + struct mm_struct *mm = vma->vm_mm; + + spin_lock(&mm->page_table_lock); + unmap_hugepage_range(vma, start, start + length); + spin_unlock(&mm->page_table_lock); +} diff --git a/mm/internal.h b/mm/internal.h new file mode 100644 index 000000000000..6bf134e8fb3d --- /dev/null +++ b/mm/internal.h @@ -0,0 +1,13 @@ +/* internal.h: mm/ internal definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* page_alloc.c */ +extern void set_page_refs(struct page *page, int order); diff --git a/mm/madvise.c b/mm/madvise.c new file mode 100644 index 000000000000..944b5e52d812 --- /dev/null +++ b/mm/madvise.c @@ -0,0 +1,242 @@ +/* + * linux/mm/madvise.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 2002 Christoph Hellwig + */ + +#include +#include +#include +#include + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. + */ +static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, + unsigned long end, int behavior) +{ + struct mm_struct * mm = vma->vm_mm; + int error = 0; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + + /* + * vm_flags is protected by the mmap_sem held in write mode. + */ + VM_ClearReadHint(vma); + + switch (behavior) { + case MADV_SEQUENTIAL: + vma->vm_flags |= VM_SEQ_READ; + break; + case MADV_RANDOM: + vma->vm_flags |= VM_RAND_READ; + break; + default: + break; + } + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +/* + * Schedule all required I/O operations. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + struct file *file = vma->vm_file; + + if (!file) + return -EBADF; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + force_page_cache_readahead(file->f_mapping, + file, start, max_sane_readahead(end - start)); + return 0; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for refill_inactive to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * refill_inactive to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). + */ +static long madvise_dontneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + return -EINVAL; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + struct zap_details details = { + .nonlinear_vma = vma, + .last_index = ULONG_MAX, + }; + zap_page_range(vma, start, end - start, &details); + } else + zap_page_range(vma, start, end - start, NULL); + return 0; +} + +static long madvise_vma(struct vm_area_struct * vma, unsigned long start, + unsigned long end, int behavior) +{ + long error = -EBADF; + + switch (behavior) { + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + error = madvise_behavior(vma, start, end, behavior); + break; + + case MADV_WILLNEED: + error = madvise_willneed(vma, start, end); + break; + + case MADV_DONTNEED: + error = madvise_dontneed(vma, start, end); + break; + + default: + error = -EINVAL; + break; + } + + return error; +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +{ + unsigned long end; + struct vm_area_struct * vma; + int unmapped_error = 0; + int error = -EINVAL; + size_t len; + + down_write(¤t->mm->mmap_sem); + + if (start & ~PAGE_MASK) + goto out; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + goto out; + + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = madvise_vma(vma, start, end, + behavior); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = madvise_vma(vma, start, vma->vm_end, behavior); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } + +out: + up_write(¤t->mm->mmap_sem); + return error; +} diff --git a/mm/memory.c b/mm/memory.c new file mode 100644 index 000000000000..fb6e5deb873a --- /dev/null +++ b/mm/memory.c @@ -0,0 +1,2165 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifndef CONFIG_DISCONTIGMEM +/* use the per-pgdat data instead for discontigmem - mbligh */ +unsigned long max_mapnr; +struct page *mem_map; + +EXPORT_SYMBOL(max_mapnr); +EXPORT_SYMBOL(mem_map); +#endif + +unsigned long num_physpages; +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ +void * high_memory; +unsigned long vmalloc_earlyreserve; + +EXPORT_SYMBOL(num_physpages); +EXPORT_SYMBOL(high_memory); +EXPORT_SYMBOL(vmalloc_earlyreserve); + +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr, unsigned long end) +{ + if (!((addr | end) & ~PMD_MASK)) { + /* Only free fully aligned ranges */ + struct page *page = pmd_page(*pmd); + pmd_clear(pmd); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; + pte_free_tlb(tlb, page); + } +} + +static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + pmd_t *empty_pmd = NULL; + + pmd = pmd_offset(pud, addr); + + /* Only free fully aligned ranges */ + if (!((addr | end) & ~PUD_MASK)) + empty_pmd = pmd; + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + clear_pte_range(tlb, pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + if (empty_pmd) { + pud_clear(pud); + pmd_free_tlb(tlb, empty_pmd); + } +} + +static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + pud_t *empty_pud = NULL; + + pud = pud_offset(pgd, addr); + + /* Only free fully aligned ranges */ + if (!((addr | end) & ~PGDIR_MASK)) + empty_pud = pud; + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + clear_pmd_range(tlb, pud, addr, next); + } while (pud++, addr = next, addr != end); + + if (empty_pud) { + pgd_clear(pgd); + pud_free_tlb(tlb, empty_pud); + } +} + +/* + * This function clears user-level page tables of a process. + * Unlike other pagetable walks, some memory layouts might give end 0. + * Must be called with pagetable lock held. + */ +void clear_page_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + + pgd = pgd_offset(tlb->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + clear_pud_range(tlb, pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + struct page *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free(new); + goto out; + } + mm->nr_ptes++; + inc_page_state(nr_page_table_pages); + pmd_populate(mm, pmd, new); + } +out: + return pte_offset_map(pmd, address); +} + +pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + pte_t *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free_kernel(new); + goto out; + } + pmd_populate_kernel(mm, pmd, new); + } +out: + return pte_offset_kernel(pmd, address); +} + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). + */ + +static inline void +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, + unsigned long addr) +{ + pte_t pte = *src_pte; + struct page *page; + unsigned long pfn; + + /* pte contains position in swap or file, so copy. */ + if (unlikely(!pte_present(pte))) { + if (!pte_file(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + list_add(&dst_mm->mmlist, &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + } + set_pte_at(dst_mm, addr, dst_pte, pte); + return; + } + + pfn = pte_pfn(pte); + /* the pte points outside of valid memory, the + * mapping is assumed to be good, meaningful + * and not mapped via rmap - duplicate the + * mapping as is. + */ + page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + if (!page || PageReserved(page)) { + set_pte_at(dst_mm, addr, dst_pte, pte); + return; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = *src_pte; + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(page); + inc_mm_counter(dst_mm, rss); + if (PageAnon(page)) + inc_mm_counter(dst_mm, anon_rss); + set_pte_at(dst_mm, addr, dst_pte, pte); + page_dup_rmap(page); +} + +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *src_pte, *dst_pte; + unsigned long vm_flags = vma->vm_flags; + int progress; + +again: + dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + if (!dst_pte) + return -ENOMEM; + src_pte = pte_offset_map_nested(src_pmd, addr); + + progress = 0; + spin_lock(&src_mm->page_table_lock); + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. + */ + if (progress >= 32 && (need_resched() || + need_lockbreak(&src_mm->page_table_lock) || + need_lockbreak(&dst_mm->page_table_lock))) + break; + if (pte_none(*src_pte)) { + progress++; + continue; + } + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + spin_unlock(&src_mm->page_table_lock); + + pte_unmap_nested(src_pte - 1); + pte_unmap(dst_pte - 1); + cond_resched_lock(&dst_mm->page_table_lock); + if (addr != end) + goto again; + return 0; +} + +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + unsigned long next; + + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; +} + +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + unsigned long next; + + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, vma); + + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; +} + +static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pte_t *pte; + + pte = pte_offset_map(pmd, addr); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) + continue; + if (pte_present(ptent)) { + struct page *page = NULL; + unsigned long pfn = pte_pfn(ptent); + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; + } + if (unlikely(details) && page) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping && + details->check_mapping != page->mapping) + continue; + /* + * Each page->index must be checked when + * invalidating or truncating nonlinear. + */ + if (details->nonlinear_vma && + (page->index < details->first_index || + page->index > details->last_index)) + continue; + } + ptent = ptep_get_and_clear(tlb->mm, addr, pte); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; + if (unlikely(details) && details->nonlinear_vma + && linear_page_index(details->nonlinear_vma, + addr) != page->index) + set_pte_at(tlb->mm, addr, pte, + pgoff_to_pte(page->index)); + if (pte_dirty(ptent)) + set_page_dirty(page); + if (PageAnon(page)) + dec_mm_counter(tlb->mm, anon_rss); + else if (pte_young(ptent)) + mark_page_accessed(page); + tlb->freed++; + page_remove_rmap(page); + tlb_remove_page(tlb, page); + continue; + } + /* + * If details->check_mapping, we leave swap entries; + * if details->nonlinear_vma, we leave file entries. + */ + if (unlikely(details)) + continue; + if (!pte_file(ptent)) + free_swap_and_cache(pte_to_swp_entry(ptent)); + pte_clear(tlb->mm, addr, pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); +} + +static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + zap_pte_range(tlb, pmd, addr, next, details); + } while (pmd++, addr = next, addr != end); +} + +static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + zap_pmd_range(tlb, pud, addr, next, details); + } while (pud++, addr = next, addr != end); +} + +static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pgd_t *pgd; + unsigned long next; + + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + zap_pud_range(tlb, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); + tlb_end_vma(tlb, vma); +} + +#ifdef CONFIG_PREEMPT +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) +#else +/* No preempt: go for improved straight-line efficiency */ +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) +#endif + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlbp: address of the caller's struct mmu_gather + * @mm: the controlling mm_struct + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here + * @details: details of nonlinear truncation or shared cache invalidation + * + * Returns the number of vma's which were covered by the unmapping. + * + * Unmap all pages in the vma list. Called under page_table_lock. + * + * We aim to not hold page_table_lock for too long (for scheduling latency + * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * return the ending mmu_gather to the caller. + * + * Only addresses between `start' and `end' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order. + * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. + */ +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *details) +{ + unsigned long zap_bytes = ZAP_BLOCK_SIZE; + unsigned long tlb_start = 0; /* For tlb_finish_mmu */ + int tlb_start_valid = 0; + int ret = 0; + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; + int fullmm = tlb_is_full_mm(*tlbp); + + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { + unsigned long start; + unsigned long end; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + continue; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + continue; + + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += (end - start) >> PAGE_SHIFT; + + ret++; + while (start != end) { + unsigned long block; + + if (!tlb_start_valid) { + tlb_start = start; + tlb_start_valid = 1; + } + + if (is_vm_hugetlb_page(vma)) { + block = end - start; + unmap_hugepage_range(vma, start, end); + } else { + block = min(zap_bytes, end - start); + unmap_page_range(*tlbp, vma, start, + start + block, details); + } + + start += block; + zap_bytes -= block; + if ((long)zap_bytes > 0) + continue; + + tlb_finish_mmu(*tlbp, tlb_start, start); + + if (need_resched() || + need_lockbreak(&mm->page_table_lock) || + (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + if (i_mmap_lock) { + /* must reset count of rss freed */ + *tlbp = tlb_gather_mmu(mm, fullmm); + details->break_addr = start; + goto out; + } + spin_unlock(&mm->page_table_lock); + cond_resched(); + spin_lock(&mm->page_table_lock); + } + + *tlbp = tlb_gather_mmu(mm, fullmm); + tlb_start_valid = 0; + zap_bytes = ZAP_BLOCK_SIZE; + } + } +out: + return ret; +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of nonlinear truncation or shared cache invalidation + */ +void zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather *tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + if (is_vm_hugetlb_page(vma)) { + zap_hugepage_range(vma, address, size); + return; + } + + lru_add_drain(); + spin_lock(&mm->page_table_lock); + tlb = tlb_gather_mmu(mm, 0); + unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); + tlb_finish_mmu(tlb, address, end); + spin_unlock(&mm->page_table_lock); +} + +/* + * Do a quick page-table lookup for a single page. + * mm->page_table_lock must be held. + */ +static struct page * +__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + unsigned long pfn; + struct page *page; + + page = follow_huge_addr(mm, address, write); + if (! IS_ERR(page)) + return page; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + if (pmd_huge(*pmd)) + return follow_huge_pmd(mm, address, pmd, write); + + ptep = pte_offset_map(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + pte_unmap(ptep); + if (pte_present(pte)) { + if (write && !pte_write(pte)) + goto out; + if (read && !pte_read(pte)) + goto out; + pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (write && !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + return page; + } + } + +out: + return NULL; +} + +struct page * +follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + return __follow_page(mm, address, /*read*/0, write); +} + +int +check_user_page_readable(struct mm_struct *mm, unsigned long address) +{ + return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; +} + +EXPORT_SYMBOL(check_user_page_readable); + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages for direct-IO. + */ + +static inline struct page *get_page_map(struct page *page) +{ + if (!pfn_valid(page_to_pfn(page))) + return NULL; + return page; +} + + +static inline int +untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + /* Check if the vma is for an anonymous mapping. */ + if (vma->vm_ops && vma->vm_ops->nopage) + return 0; + + /* Check if page directory entry exists. */ + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return 1; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + return 1; + + /* Check if page middle directory entry exists. */ + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 1; + + /* There is a pte slot for 'address' in 'mm'. */ + return 0; +} + + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int i; + unsigned int flags; + + /* + * Require read or write permissions. + * If 'force' is set, we only require the "MAY" flags. + */ + flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + + do { + struct vm_area_struct * vma; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(tsk, start)) { + unsigned long pg = start & PAGE_MASK; + struct vm_area_struct *gate_vma = get_gate_vma(tsk); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + if (write) /* user gate pages are read-only */ + return i ? : -EFAULT; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + BUG_ON(pmd_none(*pmd)); + pte = pte_offset_map(pmd, pg); + BUG_ON(pte_none(*pte)); + if (pages) { + pages[i] = pte_page(*pte); + get_page(pages[i]); + } + pte_unmap(pte); + if (vmas) + vmas[i] = gate_vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + + if (!vma || (vma->vm_flags & VM_IO) + || !(flags & vma->vm_flags)) + return i ? : -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &len, i); + continue; + } + spin_lock(&mm->page_table_lock); + do { + struct page *map; + int lookup_write = write; + + cond_resched_lock(&mm->page_table_lock); + while (!(map = follow_page(mm, start, lookup_write))) { + /* + * Shortcut for anonymous pages. We don't want + * to force the creation of pages tables for + * insanly big anonymously mapped areas that + * nobody touched so far. This is important + * for doing a core dump for these mappings. + */ + if (!lookup_write && + untouched_anonymous_page(mm,vma,start)) { + map = ZERO_PAGE(start); + break; + } + spin_unlock(&mm->page_table_lock); + switch (handle_mm_fault(mm,vma,start,write)) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + return i ? i : -EFAULT; + case VM_FAULT_OOM: + return i ? i : -ENOMEM; + default: + BUG(); + } + /* + * Now that we have performed a write fault + * and surely no longer have a shared page we + * shouldn't write, we shouldn't ignore an + * unwritable page in the page table if + * we are forcing write access. + */ + lookup_write = write && !force; + spin_lock(&mm->page_table_lock); + } + if (pages) { + pages[i] = get_page_map(map); + if (!pages[i]) { + spin_unlock(&mm->page_table_lock); + while (i--) + page_cache_release(pages[i]); + i = -EFAULT; + goto out; + } + flush_dcache_page(pages[i]); + if (!PageReserved(pages[i])) + page_cache_get(pages[i]); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + } while(len && start < vma->vm_end); + spin_unlock(&mm->page_table_lock); + } while(len); +out: + return i; +} + +EXPORT_SYMBOL(get_user_pages); + +static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, zero_pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (zeromap_pte_range(mm, pmd, addr, next, prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (zeromap_pmd_range(mm, pud, addr, next, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +int zeromap_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + struct mm_struct *mm = vma->vm_mm; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); + do { + next = pgd_addr_end(addr, end); + err = zeromap_pud_range(mm, pgd, addr, next, prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); + return err; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; +} + +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + struct mm_struct *mm = vma->vm_mm; + int err; + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_RESERVED tells swapout not to try to touch + * this region. + */ + vma->vm_flags |= VM_IO | VM_RESERVED; + + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); + return err; +} +EXPORT_SYMBOL(remap_pfn_range); + +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + pte_t entry; + + entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), + vma); + ptep_establish(vma, address, page_table, entry); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) +{ + struct page *old_page, *new_page; + unsigned long pfn = pte_pfn(pte); + pte_t entry; + + if (unlikely(!pfn_valid(pfn))) { + /* + * This should really halt the system so it can be debugged or + * at least the kernel stops what it's doing before it corrupts + * data, but for the moment just pretend this is OOM. + */ + pte_unmap(page_table); + printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", + address); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_OOM; + } + old_page = pfn_to_page(pfn); + + if (!TestSetPageLocked(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address, pfn); + entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), + vma); + ptep_set_access_flags(vma, address, page_table, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_MINOR; + } + } + pte_unmap(page_table); + + /* + * Ok, we need to copy. Oh, well.. + */ + if (!PageReserved(old_page)) + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + if (unlikely(anon_vma_prepare(vma))) + goto no_new_page; + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_zeroed_user_highpage(vma, address); + if (!new_page) + goto no_new_page; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) + goto no_new_page; + copy_user_highpage(new_page, old_page, address); + } + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (likely(pte_same(*page_table, pte))) { + if (PageAnon(old_page)) + dec_mm_counter(mm, anon_rss); + if (PageReserved(old_page)) + inc_mm_counter(mm, rss); + else + page_remove_rmap(old_page); + flush_cache_page(vma, address, pfn); + break_cow(vma, new_page, address, page_table); + lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, vma, address); + + /* Free the old page.. */ + new_page = old_page; + } + pte_unmap(page_table); + page_cache_release(new_page); + page_cache_release(old_page); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_MINOR; + +no_new_page: + page_cache_release(old_page); + return VM_FAULT_OOM; +} + +/* + * Helper functions for unmap_mapping_range(). + * + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * + * We have to restart searching the prio_tree whenever we drop the lock, + * since the iterator is only valid while the lock is held, and anyway + * a later vma might be split and reinserted earlier while lock dropped. + * + * The list of nonlinear vmas could be handled more efficiently, using + * a placeholder, but handle it in the same way until a need is shown. + * It is important to search the prio_tree before nonlinear list: a vma + * may become nonlinear and be shifted from prio_tree to nonlinear list + * while the lock is dropped; but never shifted from list to prio_tree. + * + * In order to make forward progress despite restarting the search, + * vm_truncate_count is used to mark a vma as now dealt with, so we can + * quickly skip it next time around. Since the prio_tree search only + * shows us those vmas affected by unmapping the range in question, we + * can't efficiently keep all vmas in step with mapping->truncate_count: + * so instead reset them all whenever it wraps back to 0 (then go to 1). + * mapping->truncate_count and vma->vm_truncate_count are protected by + * i_mmap_lock. + * + * In order to make forward progress despite repeatedly restarting some + * large vma, note the break_addr set by unmap_vmas when it breaks out: + * and restart from that address when we reach that vma again. It might + * have been split or merged, shrunk or extended, but never shifted: so + * restart_addr remains valid so long as it remains in the vma's range. + * unmap_mapping_range forces truncate_count to leap over page-aligned + * values so we can save vma's restart_addr in its truncate_count field. + */ +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) + +static void reset_vma_truncate_counts(struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + vma->vm_truncate_count = 0; + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_truncate_count = 0; +} + +static int unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + unsigned long restart_addr; + int need_break; + +again: + restart_addr = vma->vm_truncate_count; + if (is_restart_addr(restart_addr) && start_addr < restart_addr) { + start_addr = restart_addr; + if (start_addr >= end_addr) { + /* Top of vma has been split off since last time */ + vma->vm_truncate_count = details->truncate_count; + return 0; + } + } + + details->break_addr = end_addr; + zap_page_range(vma, start_addr, end_addr - start_addr, details); + + /* + * We cannot rely on the break test in unmap_vmas: + * on the one hand, we don't want to restart our loop + * just because that broke out for the page_table_lock; + * on the other hand, it does no test when vma is small. + */ + need_break = need_resched() || + need_lockbreak(details->i_mmap_lock); + + if (details->break_addr >= end_addr) { + /* We have now completed this vma: mark it so */ + vma->vm_truncate_count = details->truncate_count; + if (!need_break) + return 0; + } else { + /* Note restart_addr in vma's truncate_count field */ + vma->vm_truncate_count = details->break_addr; + if (!need_break) + goto again; + } + + spin_unlock(details->i_mmap_lock); + cond_resched(); + spin_lock(details->i_mmap_lock); + return -EINTR; +} + +static inline void unmap_mapping_range_tree(struct prio_tree_root *root, + struct zap_details *details) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + pgoff_t vba, vea, zba, zea; + +restart: + vma_prio_tree_foreach(vma, &iter, root, + details->first_index, details->last_index) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + + vba = vma->vm_pgoff; + vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ + zba = details->first_index; + if (zba < vba) + zba = vba; + zea = details->last_index; + if (zea > vea) + zea = vea; + + if (unmap_mapping_range_vma(vma, + ((zba - vba) << PAGE_SHIFT) + vma->vm_start, + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details) < 0) + goto restart; + } +} + +static inline void unmap_mapping_range_list(struct list_head *head, + struct zap_details *details) +{ + struct vm_area_struct *vma; + + /* + * In nonlinear VMAs there is no correspondence between virtual address + * offset and file offset. So we must perform an exhaustive search + * across *all* the pages in each nonlinear VMA, not just the pages + * whose virtual address lies outside the file truncation point. + */ +restart: + list_for_each_entry(vma, head, shared.vm_set.list) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + details->nonlinear_vma = vma; + if (unmap_mapping_range_vma(vma, vma->vm_start, + vma->vm_end, details) < 0) + goto restart; + } +} + +/** + * unmap_mapping_range - unmap the portion of all mmaps + * in the specified address_space corresponding to the specified + * page range in the underlying file. + * @address_space: the address space containing mmaps to be unmapped. + * @holebegin: byte in first page to unmap, relative to the start of + * the underlying file. This will be rounded down to a PAGE_SIZE + * boundary. Note that this is different from vmtruncate(), which + * must keep the partial page. In contrast, we must get rid of + * partial pages. + * @holelen: size of prospective hole in bytes. This will be rounded + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * end of the file. + * @even_cows: 1 when truncating a file, unmap even private COWed pages; + * but 0 when invalidating pagecache, don't throw away private data. + */ +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) +{ + struct zap_details details; + pgoff_t hba = holebegin >> PAGE_SHIFT; + pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Check for overflow. */ + if (sizeof(holelen) > sizeof(hlen)) { + long long holeend = + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (holeend & ~(long long)ULONG_MAX) + hlen = ULONG_MAX - hba + 1; + } + + details.check_mapping = even_cows? NULL: mapping; + details.nonlinear_vma = NULL; + details.first_index = hba; + details.last_index = hba + hlen - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + details.i_mmap_lock = &mapping->i_mmap_lock; + + spin_lock(&mapping->i_mmap_lock); + + /* serialize i_size write against truncate_count write */ + smp_wmb(); + /* Protect against page faults, and endless unmapping loops */ + mapping->truncate_count++; + /* + * For archs where spin_lock has inclusive semantics like ia64 + * this smp_mb() will prevent to read pagetable contents + * before the truncate_count increment is visible to + * other cpus. + */ + smp_mb(); + if (unlikely(is_restart_addr(mapping->truncate_count))) { + if (mapping->truncate_count == 0) + reset_vma_truncate_counts(mapping); + mapping->truncate_count++; + } + details.truncate_count = mapping->truncate_count; + + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); + spin_unlock(&mapping->i_mmap_lock); +} +EXPORT_SYMBOL(unmap_mapping_range); + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + /* + * truncation of in-use swapfiles is disallowed - it would cause + * subsequent swapout to scribble on the now-freed blocks. + */ + if (IS_SWAPFILE(inode)) + goto out_busy; + i_size_write(inode, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + +out_truncate: + if (inode->i_op && inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +out_busy: + return -ETXTBSY; +} + +EXPORT_SYMBOL(vmtruncate); + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + */ +void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) +{ +#ifdef CONFIG_NUMA + struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; +#endif + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. + */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(swp_entry(swp_type(entry), + offset), vma, addr); + if (!new_page) + break; + page_cache_release(new_page); +#ifdef CONFIG_NUMA + /* + * Find the next applicable VMA for the NUMA policy. + */ + addr += PAGE_SIZE; + if (addr == 0) + vma = NULL; + if (vma) { + if (addr >= vma->vm_end) { + vma = next_vma; + next_vma = vma ? vma->vm_next : NULL; + } + if (vma && addr < vma->vm_start) + vma = NULL; + } else { + if (next_vma && addr >= next_vma->vm_start) { + vma = next_vma; + next_vma = vma->vm_next; + } + } +#endif + } + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. + */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = VM_FAULT_MINOR; + + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry, address, vma); + page = read_swap_cache_async(entry, vma, address); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (likely(pte_same(*page_table, orig_pte))) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_MINOR; + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + goto out; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + grab_swap_token(); + } + + mark_page_accessed(page); + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (unlikely(!pte_same(*page_table, orig_pte))) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + ret = VM_FAULT_MINOR; + goto out; + } + + /* The page isn't present yet, go ahead with the fault. */ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + inc_mm_counter(mm, rss); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + write_access = 0; + } + unlock_page(page); + + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); + + if (write_access) { + if (do_wp_page(mm, vma, address, + page_table, pmd, pte) == VM_FAULT_OOM) + ret = VM_FAULT_OOM; + goto out; + } + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + lazy_mmu_prot_update(pte); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int +do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + pte_t *page_table, pmd_t *pmd, int write_access, + unsigned long addr) +{ + pte_t entry; + struct page * page = ZERO_PAGE(addr); + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + /* Allocate our own private page. */ + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + + if (unlikely(anon_vma_prepare(vma))) + goto no_mem; + page = alloc_zeroed_user_highpage(vma, addr); + if (!page) + goto no_mem; + + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); + + if (!pte_none(*page_table)) { + pte_unmap(page_table); + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + goto out; + } + inc_mm_counter(mm, rss); + entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, + vma->vm_page_prot)), + vma); + lru_cache_add_active(page); + SetPageReferenced(page); + page_add_anon_rmap(page, vma, addr); + } + + set_pte_at(mm, addr, page_table, entry); + pte_unmap(page_table); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + lazy_mmu_prot_update(entry); + spin_unlock(&mm->page_table_lock); +out: + return VM_FAULT_MINOR; +no_mem: + return VM_FAULT_OOM; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. + */ +static int +do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) +{ + struct page * new_page; + struct address_space *mapping = NULL; + pte_t entry; + unsigned int sequence = 0; + int ret = VM_FAULT_MINOR; + int anon = 0; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, + pmd, write_access, address); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + sequence = mapping->truncate_count; + smp_rmb(); /* serializes i_size against truncate_count */ + } +retry: + cond_resched(); + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* + * No smp_rmb is needed here as long as there's a full + * spin_lock/unlock sequence inside the ->nopage callback + * (for the pagecache lookup) that acts as an implicit + * smp_mb() and prevents the i_size read to happen + * after the next truncate_count read. + */ + + /* no page was available -- either SIGBUS or OOM */ + if (new_page == NOPAGE_SIGBUS) + return VM_FAULT_SIGBUS; + if (new_page == NOPAGE_OOM) + return VM_FAULT_OOM; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page *page; + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!page) + goto oom; + copy_user_highpage(page, new_page, address); + page_cache_release(new_page); + new_page = page; + anon = 1; + } + + spin_lock(&mm->page_table_lock); + /* + * For a file-backed vma, someone could have truncated or otherwise + * invalidated this page. If unmap_mapping_range got called, + * retry getting the page. + */ + if (mapping && unlikely(sequence != mapping->truncate_count)) { + sequence = mapping->truncate_count; + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + goto retry; + } + page_table = pte_offset_map(pmd, address); + + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + if (!PageReserved(new_page)) + inc_mm_counter(mm, rss); + + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + if (anon) { + lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, vma, address); + } else + page_add_file_rmap(new_page); + pte_unmap(page_table); + } else { + /* One of our sibling threads was faster, back out. */ + pte_unmap(page_table); + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + goto out; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + spin_unlock(&mm->page_table_lock); +out: + return ret; +oom: + page_cache_release(new_page); + ret = VM_FAULT_OOM; + goto out; +} + +/* + * Fault of a previously existing named mapping. Repopulate the pte + * from the encoded file_pte if possible. This enables swappable + * nonlinear vmas. + */ +static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) +{ + unsigned long pgoff; + int err; + + BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); + /* + * Fall back to the linear mapping if the fs does not support + * ->populate: + */ + if (!vma->vm_ops || !vma->vm_ops->populate || + (write_access && !(vma->vm_flags & VM_SHARED))) { + pte_clear(mm, address, pte); + return do_no_page(mm, vma, address, write_access, pte, pmd); + } + + pgoff = pte_to_pgoff(*pte); + + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); + + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err) + return VM_FAULT_SIGBUS; + return VM_FAULT_MAJOR; +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t *pte, pmd_t *pmd) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. + */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte, pmd); + if (pte_file(entry)) + return do_file_page(mm, vma, address, write_access, pte, pmd); + return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, pmd, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + ptep_set_access_flags(vma, address, pte, entry, write_access); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_MINOR; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + __set_current_state(TASK_RUNNING); + + inc_page_state(pgfault); + + if (is_vm_hugetlb_page(vma)) + return VM_FAULT_SIGBUS; /* mapping truncation does this. */ + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + + pud = pud_alloc(mm, pgd, address); + if (!pud) + goto oom; + + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + goto oom; + + pte = pte_alloc_map(mm, pmd, address); + if (!pte) + goto oom; + + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + + oom: + spin_unlock(&mm->page_table_lock); + return VM_FAULT_OOM; +} + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new; + + spin_unlock(&mm->page_table_lock); + new = pud_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pud_free(new); + goto out; + } + pgd_populate(mm, pgd, new); + out: + return pud_offset(pgd, address); +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ +#ifndef __ARCH_HAS_4LEVEL_HACK + if (pud_present(*pud)) { + pmd_free(new); + goto out; + } + pud_populate(mm, pud, new); +#else + if (pgd_present(*pud)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pud, new); +#endif /* __ARCH_HAS_4LEVEL_HACK */ + + out: + return pmd_offset(pud, address); +} +#endif /* __PAGETABLE_PMD_FOLDED */ + +int make_pages_present(unsigned long addr, unsigned long end) +{ + int ret, len, write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + if (!vma) + return -1; + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + if (end > vma->vm_end) + BUG(); + len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + ret = get_user_pages(current, current->mm, addr, + len, write, 0, NULL, NULL); + if (ret < 0) + return ret; + return ret == len ? 0 : -1; +} + +/* + * Map a vmalloc()-space virtual address to the physical page. + */ +struct page * vmalloc_to_page(void * vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (!pgd_none(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } + } + } + return page; +} + +EXPORT_SYMBOL(vmalloc_to_page); + +/* + * Map a vmalloc()-space virtual address to the physical page frame number. + */ +unsigned long vmalloc_to_pfn(void * vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} + +EXPORT_SYMBOL(vmalloc_to_pfn); + +/* + * update_mem_hiwater + * - update per process rss and vm high water data + */ +void update_mem_hiwater(struct task_struct *tsk) +{ + if (tsk->mm) { + unsigned long rss = get_mm_counter(tsk->mm, rss); + + if (tsk->mm->hiwater_rss < rss) + tsk->mm->hiwater_rss = rss; + if (tsk->mm->hiwater_vm < tsk->mm->total_vm) + tsk->mm->hiwater_vm = tsk->mm->total_vm; + } +} + +#if !defined(__HAVE_ARCH_GATE_AREA) + +#if defined(AT_SYSINFO_EHDR) +struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_page_prot = PAGE_READONLY; + gate_vma.vm_flags = 0; + return 0; +} +__initcall(gate_vma_init); +#endif + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef AT_SYSINFO_EHDR + return &gate_vma; +#else + return NULL; +#endif +} + +int in_gate_area_no_task(unsigned long addr) +{ +#ifdef AT_SYSINFO_EHDR + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; +#endif + return 0; +} + +#endif /* __HAVE_ARCH_GATE_AREA */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c new file mode 100644 index 000000000000..a3b44a671cec --- /dev/null +++ b/mm/mempolicy.c @@ -0,0 +1,1138 @@ +/* + * Simple NUMA memory policy for the Linux kernel. + * + * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, version 2. + * + * NUMA policy allows the user to give hints in which node(s) memory should + * be allocated. + * + * Support four policies per VMA and per process: + * + * The VMA policy has priority over the process policy for a page fault. + * + * interleave Allocate memory interleaved over a set of nodes, + * with normal fallback if it fails. + * For VMA based allocations this interleaves based on the + * offset into the backing object or offset into the mapping + * for anonymous memory. For process policy an process counter + * is used. + * bind Only allocate memory on a specific set of nodes, + * no fallback. + * preferred Try a specific node first before normal fallback. + * As a special case node -1 here means do the allocation + * on the local CPU. This is normally identical to default, + * but useful to set in a VMA when you have a non default + * process policy. + * default Allocate on the local node first, or when on a VMA + * use the process policy. This is what Linux always did + * in a NUMA aware kernel and still does by, ahem, default. + * + * The process policy is applied for most non interrupt memory allocations + * in that process' context. Interrupts ignore the policies and always + * try to allocate on the local CPU. The VMA policy is only applied for memory + * allocations for a VMA in the VM. + * + * Currently there are a few corner cases in swapping where the policy + * is not applied, but the majority should be handled. When process policy + * is used it is not remembered over swap outs/swap ins. + * + * Only the highest zone in the zone hierarchy gets policied. Allocations + * requesting a lower zone just use default policy. This implies that + * on systems with highmem kernel lowmem allocation don't get policied. + * Same with GFP_DMA allocations. + * + * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * all users and remembered even when nobody has memory mapped. + */ + +/* Notebook: + fix mmap readahead to honour policy and enable policy for any page cache + object + statistics for bigpages + global policy for page cache? currently it uses process policy. Requires + first item above. + handle mremap for shared memory (currently ignored for the policy) + grows down? + make bind policy root only? It can trigger oom much faster and the + kernel is not always grateful with that. + could replace all the switch()es with a mempolicy_ops structure. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static kmem_cache_t *policy_cache; +static kmem_cache_t *sn_cache; + +#define PDprintk(fmt...) + +/* Highest zone. An specific allocation for a zone below that is not + policied. */ +static int policy_zone; + +static struct mempolicy default_policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .policy = MPOL_DEFAULT, +}; + +/* Check if all specified nodes are online */ +static int nodes_online(unsigned long *nodes) +{ + DECLARE_BITMAP(online2, MAX_NUMNODES); + + bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); + if (bitmap_empty(online2, MAX_NUMNODES)) + set_bit(0, online2); + if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) + return -EINVAL; + return 0; +} + +/* Do sanity checking on a policy */ +static int mpol_check_policy(int mode, unsigned long *nodes) +{ + int empty = bitmap_empty(nodes, MAX_NUMNODES); + + switch (mode) { + case MPOL_DEFAULT: + if (!empty) + return -EINVAL; + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + /* Preferred will only use the first bit, but allow + more for now. */ + if (empty) + return -EINVAL; + break; + } + return nodes_online(nodes); +} + +/* Copy a node mask from user space. */ +static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, + unsigned long maxnode, int mode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + bitmap_zero(nodes, MAX_NUMNODES); + if (maxnode == 0 || !nmask) + return 0; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes[nlongs-1] &= endmask; + /* Update current mems_allowed */ + cpuset_update_current_mems_allowed(); + /* Ignore nodes not set in current->mems_allowed */ + cpuset_restrict_to_mems_allowed(nodes); + return mpol_check_policy(mode, nodes); +} + +/* Generate a custom zonelist for the BIND policy. */ +static struct zonelist *bind_zonelist(unsigned long *nodes) +{ + struct zonelist *zl; + int num, max, nd; + + max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); + zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + if (!zl) + return NULL; + num = 0; + for (nd = find_first_bit(nodes, MAX_NUMNODES); + nd < MAX_NUMNODES; + nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { + int k; + for (k = MAX_NR_ZONES-1; k >= 0; k--) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (!z->present_pages) + continue; + zl->zones[num++] = z; + if (k > policy_zone) + policy_zone = k; + } + } + BUG_ON(num >= max); + zl->zones[num] = NULL; + return zl; +} + +/* Create a new policy */ +static struct mempolicy *mpol_new(int mode, unsigned long *nodes) +{ + struct mempolicy *policy; + + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); + if (mode == MPOL_DEFAULT) + return NULL; + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!policy) + return ERR_PTR(-ENOMEM); + atomic_set(&policy->refcnt, 1); + switch (mode) { + case MPOL_INTERLEAVE: + bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); + break; + case MPOL_PREFERRED: + policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); + if (policy->v.preferred_node >= MAX_NUMNODES) + policy->v.preferred_node = -1; + break; + case MPOL_BIND: + policy->v.zonelist = bind_zonelist(nodes); + if (policy->v.zonelist == NULL) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-ENOMEM); + } + break; + } + policy->policy = mode; + return policy; +} + +/* Ensure all existing pages follow the policy. */ +static int +verify_pages(struct mm_struct *mm, + unsigned long addr, unsigned long end, unsigned long *nodes) +{ + while (addr < end) { + struct page *p; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) { + unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; + if (next > addr) + break; + addr = next; + continue; + } + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + addr = (addr + PUD_SIZE) & PUD_MASK; + continue; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + addr = (addr + PMD_SIZE) & PMD_MASK; + continue; + } + p = NULL; + pte = pte_offset_map(pmd, addr); + if (pte_present(*pte)) + p = pte_page(*pte); + pte_unmap(pte); + if (p) { + unsigned nid = page_to_nid(p); + if (!test_bit(nid, nodes)) + return -EIO; + } + addr += PAGE_SIZE; + } + return 0; +} + +/* Step 1: check the range */ +static struct vm_area_struct * +check_range(struct mm_struct *mm, unsigned long start, unsigned long end, + unsigned long *nodes, unsigned long flags) +{ + int err; + struct vm_area_struct *first, *vma, *prev; + + first = find_vma(mm, start); + if (!first) + return ERR_PTR(-EFAULT); + prev = NULL; + for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + err = verify_pages(vma->vm_mm, + vma->vm_start, vma->vm_end, nodes); + if (err) { + first = ERR_PTR(err); + break; + } + } + prev = vma; + } + return first; +} + +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_free(old); + } + return err; +} + +/* Step 2: apply policy to a range and do splits. */ +static int mbind_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct mempolicy *new) +{ + struct vm_area_struct *next; + int err; + + err = 0; + for (; vma && vma->vm_start < end; vma = next) { + next = vma->vm_next; + if (vma->vm_start < start) + err = split_vma(vma->vm_mm, vma, start, 1); + if (!err && vma->vm_end > end) + err = split_vma(vma->vm_mm, vma, end, 0); + if (!err) + err = policy_vma(vma, new); + if (err) + break; + } + return err; +} + +/* Change policy for a memory range */ +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + int err; + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) + return -EINVAL; + if (start & ~PAGE_MASK) + return -EINVAL; + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + + new = mpol_new(mode, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nodes, flags); + err = PTR_ERR(vma); + if (!IS_ERR(vma)) + err = mbind_range(vma, start, end, new); + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) +{ + int err; + struct mempolicy *new; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + + if (mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + new = mpol_new(mode, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + mpol_free(current->mempolicy); + current->mempolicy = new; + if (new && new->policy == MPOL_INTERLEAVE) + current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + return 0; +} + +/* Fill a zone bitmap for a policy */ +static void get_zonemask(struct mempolicy *p, unsigned long *nodes) +{ + int i; + + bitmap_zero(nodes, MAX_NUMNODES); + switch (p->policy) { + case MPOL_BIND: + for (i = 0; p->v.zonelist->zones[i]; i++) + __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); + break; + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); + break; + case MPOL_PREFERRED: + /* or use current node instead of online map? */ + if (p->v.preferred_node < 0) + bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); + else + __set_bit(p->v.preferred_node, nodes); + break; + default: + BUG(); + } +} + +static int lookup_node(struct mm_struct *mm, unsigned long addr) +{ + struct page *p; + int err; + + err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + if (err >= 0) { + err = page_to_nid(p); + put_page(p); + } + return err; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + void *nodes, unsigned nbytes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct mempolicy *pol = current->mempolicy; + + if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + if (flags & MPOL_F_ADDR) { + down_read(&mm->mmap_sem); + vma = find_vma_intersection(mm, addr, addr+1); + if (!vma) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else + pol = vma->vm_policy; + } else if (addr) + return -EINVAL; + + if (!pol) + pol = &default_policy; + + if (flags & MPOL_F_NODE) { + if (flags & MPOL_F_ADDR) { + err = lookup_node(mm, addr); + if (err < 0) + goto out; + pval = err; + } else if (pol == current->mempolicy && + pol->policy == MPOL_INTERLEAVE) { + pval = current->il_next; + } else { + err = -EINVAL; + goto out; + } + } else + pval = pol->policy; + + if (vma) { + up_read(¤t->mm->mmap_sem); + vma = NULL; + } + + if (policy && put_user(pval, policy)) + return -EFAULT; + + err = 0; + if (nmask) { + DECLARE_BITMAP(nodes, MAX_NUMNODES); + get_zonemask(pol, nodes); + err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); + } + + out: + if (vma) + up_read(¤t->mm->mmap_sem); + return err; +} + +#ifdef CONFIG_COMPAT + +asmlinkage long compat_sys_get_mempolicy(int __user *policy, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, + compat_ulong_t addr, compat_ulong_t flags) +{ + long err; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) + nm = compat_alloc_user_space(alloc_size); + + err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); + + if (!err && nmask) { + err = copy_from_user(bm, nm, alloc_size); + /* ensure entire bitmap is zeroed */ + err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); + err |= compat_put_bitmap(nmask, bm, nr_bits); + } + + return err; +} + +asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(bm, nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, bm, alloc_size); + } + + if (err) + return -EFAULT; + + return sys_set_mempolicy(mode, nm, nr_bits+1); +} + +asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, + compat_ulong_t mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode, compat_ulong_t flags) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(bm, nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, bm, alloc_size); + } + + if (err) + return -EFAULT; + + return sys_mbind(start, len, mode, nm, nr_bits+1, flags); +} + +#endif + +/* Return effective policy for a VMA */ +static struct mempolicy * +get_vma_policy(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = current->mempolicy; + + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else if (vma->vm_policy && + vma->vm_policy->policy != MPOL_DEFAULT) + pol = vma->vm_policy; + } + if (!pol) + pol = &default_policy; + return pol; +} + +/* Return a zonelist representing a mempolicy */ +static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy) +{ + int nd; + + switch (policy->policy) { + case MPOL_PREFERRED: + nd = policy->v.preferred_node; + if (nd < 0) + nd = numa_node_id(); + break; + case MPOL_BIND: + /* Lower zones don't get a policy applied */ + /* Careful: current->mems_allowed might have moved */ + if (gfp >= policy_zone) + if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) + return policy->v.zonelist; + /*FALL THROUGH*/ + case MPOL_INTERLEAVE: /* should not happen */ + case MPOL_DEFAULT: + nd = numa_node_id(); + break; + default: + nd = 0; + BUG(); + } + return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); +} + +/* Do dynamic interleaving for a process */ +static unsigned interleave_nodes(struct mempolicy *policy) +{ + unsigned nid, next; + struct task_struct *me = current; + + nid = me->il_next; + BUG_ON(nid >= MAX_NUMNODES); + next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); + if (next >= MAX_NUMNODES) + next = find_first_bit(policy->v.nodes, MAX_NUMNODES); + me->il_next = next; + return nid; +} + +/* Do static interleaving for a VMA with known offset. */ +static unsigned offset_il_node(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long off) +{ + unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); + unsigned target = (unsigned)off % nnodes; + int c; + int nid = -1; + + c = 0; + do { + nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); + c++; + } while (c <= target); + BUG_ON(nid >= MAX_NUMNODES); + BUG_ON(!test_bit(nid, pol->v.nodes)); + return nid; +} + +/* Allocate a page in interleaved policy. + Own path because it needs to do special accounting. */ +static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid) +{ + struct zonelist *zl; + struct page *page; + + BUG_ON(!node_online(nid)); + zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); + page = __alloc_pages(gfp, order, zl); + if (page && page_zone(page) == zl->zones[0]) { + zl->zones[0]->pageset[get_cpu()].interleave_hit++; + put_cpu(); + } + return page; +} + +/** + * alloc_page_vma - Allocate a page for a VMA. + * + * @gfp: + * %GFP_USER user allocation. + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * + * This function allocates a page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with the mm_sem of the vma hold. + */ +struct page * +alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + cpuset_update_current_mems_allowed(); + + if (unlikely(pol->policy == MPOL_INTERLEAVE)) { + unsigned nid; + if (vma) { + unsigned long off; + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> PAGE_SHIFT; + nid = offset_il_node(pol, vma, off); + } else { + /* fall back to process interleaving */ + nid = interleave_nodes(pol); + } + return alloc_page_interleave(gfp, 0, nid); + } + return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); +} + +/** + * alloc_pages_current - Allocate pages. + * + * @gfp: + * %GFP_USER user allocation, + * %GFP_KERNEL kernel allocation, + * %GFP_HIGHMEM highmem allocation, + * %GFP_FS don't call back into a file system. + * %GFP_ATOMIC don't sleep. + * @order: Power of two of allocation size in pages. 0 is a single page. + * + * Allocate a page from the kernel page pool. When not in + * interrupt context and apply the current process NUMA policy. + * Returns NULL when no page can be allocated. + * + * Don't call cpuset_update_current_mems_allowed() unless + * 1) it's ok to take cpuset_sem (can WAIT), and + * 2) allocating for current task (not interrupt). + */ +struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order) +{ + struct mempolicy *pol = current->mempolicy; + + if ((gfp & __GFP_WAIT) && !in_interrupt()) + cpuset_update_current_mems_allowed(); + if (!pol || in_interrupt()) + pol = &default_policy; + if (pol->policy == MPOL_INTERLEAVE) + return alloc_page_interleave(gfp, order, interleave_nodes(pol)); + return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); +} +EXPORT_SYMBOL(alloc_pages_current); + +/* Slow path of a mempolicy copy */ +struct mempolicy *__mpol_copy(struct mempolicy *old) +{ + struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + + if (!new) + return ERR_PTR(-ENOMEM); + *new = *old; + atomic_set(&new->refcnt, 1); + if (new->policy == MPOL_BIND) { + int sz = ksize(old->v.zonelist); + new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + if (!new->v.zonelist) { + kmem_cache_free(policy_cache, new); + return ERR_PTR(-ENOMEM); + } + memcpy(new->v.zonelist, old->v.zonelist, sz); + } + return new; +} + +/* Slow path of a mempolicy comparison */ +int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (!a || !b) + return 0; + if (a->policy != b->policy) + return 0; + switch (a->policy) { + case MPOL_DEFAULT: + return 1; + case MPOL_INTERLEAVE: + return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); + case MPOL_PREFERRED: + return a->v.preferred_node == b->v.preferred_node; + case MPOL_BIND: { + int i; + for (i = 0; a->v.zonelist->zones[i]; i++) + if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) + return 0; + return b->v.zonelist->zones[i] == NULL; + } + default: + BUG(); + return 0; + } +} + +/* Slow path of a mpol destructor. */ +void __mpol_free(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + if (p->policy == MPOL_BIND) + kfree(p->v.zonelist); + p->policy = MPOL_DEFAULT; + kmem_cache_free(policy_cache, p); +} + +/* + * Hugetlb policy. Same as above, just works with node numbers instead of + * zonelists. + */ + +/* Find first node suitable for an allocation */ +int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + switch (pol->policy) { + case MPOL_DEFAULT: + return numa_node_id(); + case MPOL_BIND: + return pol->v.zonelist->zones[0]->zone_pgdat->node_id; + case MPOL_INTERLEAVE: + return interleave_nodes(pol); + case MPOL_PREFERRED: + return pol->v.preferred_node >= 0 ? + pol->v.preferred_node : numa_node_id(); + } + BUG(); + return 0; +} + +/* Find secondary valid nodes for an allocation */ +int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + switch (pol->policy) { + case MPOL_PREFERRED: + case MPOL_DEFAULT: + case MPOL_INTERLEAVE: + return 1; + case MPOL_BIND: { + struct zone **z; + for (z = pol->v.zonelist->zones; *z; z++) + if ((*z)->zone_pgdat->node_id == nid) + return 1; + return 0; + } + default: + BUG(); + return 0; + } +} + +/* + * Shared memory backing store policy support. + * + * Remember policies even when nobody has shared memory mapped. + * The policies are kept in Red-Black tree linked from the inode. + * They are protected by the sp->lock spinlock, which should be held + * for any accesses to the tree. + */ + +/* lookup first element intersecting start-end */ +/* Caller holds sp->lock */ +static struct sp_node * +sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +{ + struct rb_node *n = sp->root.rb_node; + + while (n) { + struct sp_node *p = rb_entry(n, struct sp_node, nd); + + if (start >= p->end) + n = n->rb_right; + else if (end <= p->start) + n = n->rb_left; + else + break; + } + if (!n) + return NULL; + for (;;) { + struct sp_node *w = NULL; + struct rb_node *prev = rb_prev(n); + if (!prev) + break; + w = rb_entry(prev, struct sp_node, nd); + if (w->end <= start) + break; + n = prev; + } + return rb_entry(n, struct sp_node, nd); +} + +/* Insert a new shared policy into the list. */ +/* Caller holds sp->lock */ +static void sp_insert(struct shared_policy *sp, struct sp_node *new) +{ + struct rb_node **p = &sp->root.rb_node; + struct rb_node *parent = NULL; + struct sp_node *nd; + + while (*p) { + parent = *p; + nd = rb_entry(parent, struct sp_node, nd); + if (new->start < nd->start) + p = &(*p)->rb_left; + else if (new->end > nd->end) + p = &(*p)->rb_right; + else + BUG(); + } + rb_link_node(&new->nd, parent, p); + rb_insert_color(&new->nd, &sp->root); + PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, + new->policy ? new->policy->policy : 0); +} + +/* Find shared policy intersecting idx */ +struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + struct mempolicy *pol = NULL; + struct sp_node *sn; + + if (!sp->root.rb_node) + return NULL; + spin_lock(&sp->lock); + sn = sp_lookup(sp, idx, idx+1); + if (sn) { + mpol_get(sn->policy); + pol = sn->policy; + } + spin_unlock(&sp->lock); + return pol; +} + +static void sp_delete(struct shared_policy *sp, struct sp_node *n) +{ + PDprintk("deleting %lx-l%x\n", n->start, n->end); + rb_erase(&n->nd, &sp->root); + mpol_free(n->policy); + kmem_cache_free(sn_cache, n); +} + +struct sp_node * +sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) +{ + struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + + if (!n) + return NULL; + n->start = start; + n->end = end; + mpol_get(pol); + n->policy = pol; + return n; +} + +/* Replace a policy range. */ +static int shared_policy_replace(struct shared_policy *sp, unsigned long start, + unsigned long end, struct sp_node *new) +{ + struct sp_node *n, *new2 = NULL; + +restart: + spin_lock(&sp->lock); + n = sp_lookup(sp, start, end); + /* Take care of old policies in the same range. */ + while (n && n->start < end) { + struct rb_node *next = rb_next(&n->nd); + if (n->start >= start) { + if (n->end <= end) + sp_delete(sp, n); + else + n->start = end; + } else { + /* Old policy spanning whole new range. */ + if (n->end > end) { + if (!new2) { + spin_unlock(&sp->lock); + new2 = sp_alloc(end, n->end, n->policy); + if (!new2) + return -ENOMEM; + goto restart; + } + n->end = start; + sp_insert(sp, new2); + new2 = NULL; + break; + } else + n->end = start; + } + if (!next) + break; + n = rb_entry(next, struct sp_node, nd); + } + if (new) + sp_insert(sp, new); + spin_unlock(&sp->lock); + if (new2) { + mpol_free(new2->policy); + kmem_cache_free(sn_cache, new2); + } + return 0; +} + +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, struct mempolicy *npol) +{ + int err; + struct sp_node *new = NULL; + unsigned long sz = vma_pages(vma); + + PDprintk("set_shared_policy %lx sz %lu %d %lx\n", + vma->vm_pgoff, + sz, npol? npol->policy : -1, + npol ? npol->v.nodes[0] : -1); + + if (npol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (!new) + return -ENOMEM; + } + err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + if (err && new) + kmem_cache_free(sn_cache, new); + return err; +} + +/* Free a backing policy store on inode delete. */ +void mpol_free_shared_policy(struct shared_policy *p) +{ + struct sp_node *n; + struct rb_node *next; + + if (!p->root.rb_node) + return; + spin_lock(&p->lock); + next = rb_first(&p->root); + while (next) { + n = rb_entry(next, struct sp_node, nd); + next = rb_next(&n->nd); + mpol_free(n->policy); + kmem_cache_free(sn_cache, n); + } + spin_unlock(&p->lock); + p->root = RB_ROOT; +} + +/* assumes fs == KERNEL_DS */ +void __init numa_policy_init(void) +{ + policy_cache = kmem_cache_create("numa_policy", + sizeof(struct mempolicy), + 0, SLAB_PANIC, NULL, NULL); + + sn_cache = kmem_cache_create("shared_policy_node", + sizeof(struct sp_node), + 0, SLAB_PANIC, NULL, NULL); + + /* Set interleaving policy for system init. This way not all + the data structures allocated at system boot end up in node zero. */ + + if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), + MAX_NUMNODES) < 0) + printk("numa_policy_init: interleaving failed\n"); +} + +/* Reset policy of current process to default. + * Assumes fs == KERNEL_DS */ +void numa_default_policy(void) +{ + sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); +} diff --git a/mm/mempool.c b/mm/mempool.c new file mode 100644 index 000000000000..b014ffeaa413 --- /dev/null +++ b/mm/mempool.c @@ -0,0 +1,290 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + */ + +#include +#include +#include +#include +#include +#include + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + BUG_ON(pool->curr_nr <= 0); + return pool->elements[--pool->curr_nr]; +} + +static void free_pool(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc and mempool_free + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc function is not called + * from IRQ contexts. + */ +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + mempool_t *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + memset(pool, 0, sizeof(*pool)); + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (unlikely(!element)) { + free_pool(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} +EXPORT_SYMBOL(mempool_create); + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { + while (new_min_nr < pool->curr_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(new_min_nr <= pool->min_nr)) { + /* Raced, other resize will do our work */ + spin_unlock_irqrestore(&pool->lock, flags); + kfree(new_elements); + goto out; + } + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(gfp_mask, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + } else { + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); /* Raced */ + goto out; + } + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} +EXPORT_SYMBOL(mempool_resize); + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that all elements have been returned to the pool (ie: + * freed) prior to calling mempool_destroy(). + */ +void mempool_destroy(mempool_t *pool) +{ + if (pool->curr_nr != pool->min_nr) + BUG(); /* There were outstanding elements */ + free_pool(pool); +} +EXPORT_SYMBOL(mempool_destroy); + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) +{ + void *element; + unsigned long flags; + DEFINE_WAIT(wait); + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + + might_sleep_if(gfp_mask & __GFP_WAIT); +repeat_alloc: + element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data); + if (likely(element != NULL)) + return element; + + /* + * If the pool is less than 50% full and we can perform effective + * page reclaim then try harder to allocate an element. + */ + mb(); + if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) && + (pool->curr_nr <= pool->min_nr/2)) { + element = pool->alloc(gfp_mask, pool->pool_data); + if (likely(element != NULL)) + return element; + } + + /* + * Kick the VM at this point. + */ + wakeup_bdflush(0); + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (!(gfp_mask & __GFP_WAIT)) + return NULL; + + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + mb(); + if (!pool->curr_nr) + io_schedule(); + finish_wait(&pool->wait, &wait); + + goto repeat_alloc; +} +EXPORT_SYMBOL(mempool_alloc); + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + mb(); + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} +EXPORT_SYMBOL(mempool_free); + +/* + * A commonly used alloc and free fn. + */ +void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} +EXPORT_SYMBOL(mempool_alloc_slab); + +void mempool_free_slab(void *element, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + kmem_cache_free(mem, element); +} +EXPORT_SYMBOL(mempool_free_slab); diff --git a/mm/mincore.c b/mm/mincore.c new file mode 100644 index 000000000000..07833dc5829d --- /dev/null +++ b/mm/mincore.c @@ -0,0 +1,191 @@ +/* + * linux/mm/mincore.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * The mincore() system call. + */ +#include +#include +#include +#include +#include + +#include +#include + +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. + */ +static unsigned char mincore_page(struct vm_area_struct * vma, + unsigned long pgoff) +{ + unsigned char present = 0; + struct address_space * as = vma->vm_file->f_mapping; + struct page * page; + + page = find_get_page(as, pgoff); + if (page) { + present = PageUptodate(page); + page_cache_release(page); + } + + return present; +} + +static long mincore_vma(struct vm_area_struct * vma, + unsigned long start, unsigned long end, unsigned char __user * vec) +{ + long error, i, remaining; + unsigned char * tmp; + + error = -ENOMEM; + if (!vma->vm_file) + return error; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + error = -EAGAIN; + tmp = (unsigned char *) __get_free_page(GFP_KERNEL); + if (!tmp) + return error; + + /* (end - start) is # of pages, and also # of bytes in "vec */ + remaining = (end - start), + + error = 0; + for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { + int j = 0; + long thispiece = (remaining < PAGE_SIZE) ? + remaining : PAGE_SIZE; + + while (j < thispiece) + tmp[j++] = mincore_page(vma, start++); + + if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + error = -EFAULT; + break; + } + } + + free_page((unsigned long) tmp); + return error; +} + +/* + * The mincore(2) system call. + * + * mincore() returns the memory residency status of the pages in the + * current process's address space specified by [addr, addr + len). + * The status is returned in a vector of bytes. The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +asmlinkage long sys_mincore(unsigned long start, size_t len, + unsigned char __user * vec) +{ + int index = 0; + unsigned long end, limit; + struct vm_area_struct * vma; + size_t max; + int unmapped_error = 0; + long error; + + /* check the arguments */ + if (start & ~PAGE_CACHE_MASK) + goto einval; + + if (start < FIRST_USER_PGD_NR * PGDIR_SIZE) + goto enomem; + + limit = TASK_SIZE; + if (start >= limit) + goto enomem; + + if (!len) + return 0; + + max = limit - start; + len = PAGE_CACHE_ALIGN(len); + if (len > max || !len) + goto enomem; + + end = start + len; + + /* check the output buffer whilst holding the lock */ + error = -EFAULT; + down_read(¤t->mm->mmap_sem); + + if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT)) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + */ + error = 0; + + vma = find_vma(current->mm, start); + while (vma) { + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = mincore_vma(vma, start, end, + &vec[index]); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = mincore_vma(vma, start, vma->vm_end, &vec[index]); + if (error) + goto out; + index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + start = vma->vm_end; + vma = vma->vm_next; + } + + /* we found a hole in the area queried if we arrive here */ + error = -ENOMEM; + +out: + up_read(¤t->mm->mmap_sem); + return error; + +einval: + return -EINVAL; +enomem: + return -ENOMEM; +} diff --git a/mm/mlock.c b/mm/mlock.c new file mode 100644 index 000000000000..4ae3a46ff768 --- /dev/null +++ b/mm/mlock.c @@ -0,0 +1,253 @@ +/* + * linux/mm/mlock.c + * + * (C) Copyright 1995 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + */ + +#include +#include +#include +#include + + +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, unsigned int newflags) +{ + struct mm_struct * mm = vma->vm_mm; + pgoff_t pgoff; + int pages; + int ret = 0; + + if (newflags == vma->vm_flags) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + goto out; + } + + if (end != vma->vm_end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + goto out; + } + +success: + /* + * vm_flags is protected by the mmap_sem held in write mode. + * It's okay if try_to_unmap_one unmaps a page just after we + * set VM_LOCKED, make_pages_present below will bring it back. + */ + vma->vm_flags = newflags; + + /* + * Keep track of amount of locked VM. + */ + pages = (end - start) >> PAGE_SHIFT; + if (newflags & VM_LOCKED) { + pages = -pages; + if (!(newflags & VM_IO)) + ret = make_pages_present(start, end); + } + + vma->vm_mm->locked_vm -= pages; +out: + if (ret == -ENOMEM) + ret = -EAGAIN; + return ret; +} + +static int do_mlock(unsigned long start, size_t len, int on) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * prev; + int error; + + len = PAGE_ALIGN(len); + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + vma = find_vma_prev(current->mm, start, &prev); + if (!vma || vma->vm_start > start) + return -ENOMEM; + + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + newflags = vma->vm_flags | VM_LOCKED; + if (!on) + newflags &= ~VM_LOCKED; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + break; + nstart = tmp; + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + break; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + break; + } + } + return error; +} + +asmlinkage long sys_mlock(unsigned long start, size_t len) +{ + unsigned long locked; + unsigned long lock_limit; + int error = -ENOMEM; + + if (!can_do_mlock()) + return -EPERM; + + down_write(¤t->mm->mmap_sem); + len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + start &= PAGE_MASK; + + locked = len >> PAGE_SHIFT; + locked += current->mm->locked_vm; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + /* check against resource limits */ + if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) + error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); + return error; +} + +asmlinkage long sys_munlock(unsigned long start, size_t len) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + start &= PAGE_MASK; + ret = do_mlock(start, len, 0); + up_write(¤t->mm->mmap_sem); + return ret; +} + +static int do_mlockall(int flags) +{ + struct vm_area_struct * vma, * prev = NULL; + unsigned int def_flags = 0; + + if (flags & MCL_FUTURE) + def_flags = VM_LOCKED; + current->mm->def_flags = def_flags; + if (flags == MCL_FUTURE) + goto out; + + for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + unsigned int newflags; + + newflags = vma->vm_flags | VM_LOCKED; + if (!(flags & MCL_CURRENT)) + newflags &= ~VM_LOCKED; + + /* Ignore errors */ + mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + } +out: + return 0; +} + +asmlinkage long sys_mlockall(int flags) +{ + unsigned long lock_limit; + int ret = -EINVAL; + + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + goto out; + + ret = -EPERM; + if (!can_do_mlock()) + goto out; + + down_write(¤t->mm->mmap_sem); + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + ret = -ENOMEM; + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || + capable(CAP_IPC_LOCK)) + ret = do_mlockall(flags); + up_write(¤t->mm->mmap_sem); +out: + return ret; +} + +asmlinkage long sys_munlockall(void) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mlockall(0); + up_write(¤t->mm->mmap_sem); + return ret; +} + +/* + * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB + * shm segments) get accounted against the user_struct instead. + */ +static DEFINE_SPINLOCK(shmlock_user_lock); + +int user_shm_lock(size_t size, struct user_struct *user) +{ + unsigned long lock_limit, locked; + int allowed = 0; + + locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + spin_lock(&shmlock_user_lock); + if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + goto out; + get_uid(user); + user->locked_shm += locked; + allowed = 1; +out: + spin_unlock(&shmlock_user_lock); + return allowed; +} + +void user_shm_unlock(size_t size, struct user_struct *user) +{ + spin_lock(&shmlock_user_lock); + user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + spin_unlock(&shmlock_user_lock); + free_uid(user); +} diff --git a/mm/mmap.c b/mm/mmap.c new file mode 100644 index 000000000000..a95ebda27446 --- /dev/null +++ b/mm/mmap.c @@ -0,0 +1,2082 @@ +/* + * mm/mmap.c + * + * Written by obz. + * + * Address space accounting code + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * WARNING: the debugging will use recursive algorithms so never enable this + * unless you know what you are doing. + */ +#undef DEBUG_MM_RB + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +atomic_t vm_committed_space = ATOMIC_INIT(0); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(long pages, int cap_sys_admin) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + unsigned long n; + + free = get_page_cache_size(); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += atomic_read(&slab_reclaim_pages); + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + free -= free / 32; + + if (free > pages) + return 0; + + /* + * nr_free_pages() is very expensive on large systems, + * only call if we're about to fail. + */ + n = nr_free_pages(); + if (!cap_sys_admin) + n -= n / 32; + free += n; + + if (free > pages) + return 0; + vm_unacct_memory(pages); + return -ENOMEM; + } + + allowed = (totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100; + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + allowed -= allowed / 32; + allowed += total_swap_pages; + + /* Don't let a single process grow too big: + leave 3% of the size of this process for other processes */ + allowed -= current->mm->total_vm / 32; + + if (atomic_read(&vm_committed_space) < allowed) + return 0; + + vm_unacct_memory(pages); + + return -ENOMEM; +} + +EXPORT_SYMBOL(sysctl_overcommit_memory); +EXPORT_SYMBOL(sysctl_overcommit_ratio); +EXPORT_SYMBOL(sysctl_max_map_count); +EXPORT_SYMBOL(vm_committed_space); +EXPORT_SYMBOL(__vm_enough_memory); + +/* + * Requires inode->i_mapping->i_mmap_lock + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct file *file, struct address_space *mapping) +{ + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping->i_mmap_writable--; + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) + list_del_init(&vma->shared.vm_set.list); + else + vma_prio_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Remove one vm structure and free it. + */ +static void remove_vm_struct(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + might_sleep(); + if (file) { + struct address_space *mapping = file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + __remove_shared_vm_struct(vma, file, mapping); + spin_unlock(&mapping->i_mmap_lock); + } + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (file) + fput(file); + anon_vma_unlink(vma); + mpol_free(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against rlimit.. */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +#ifdef DEBUG_MM_RB +static int browse_rb(struct rb_root *root) +{ + int i = 0, j; + struct rb_node *nd, *pn = NULL; + unsigned long prev = 0, pend = 0; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + if (vma->vm_start < prev) + printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; + if (vma->vm_start < pend) + printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + if (vma->vm_start > vma->vm_end) + printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); + i++; + pn = nd; + } + j = 0; + for (nd = pn; nd; nd = rb_prev(nd)) { + j++; + } + if (i != j) + printk("backwards %d, forwards %d\n", j, i), i = 0; + return i; +} + +void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + struct vm_area_struct *tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(&mm->mm_rb); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (bug) + BUG(); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * +find_vma_prepare(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev, struct rb_node ***rb_link, + struct rb_node ** rb_parent) +{ + struct vm_area_struct * vma; + struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + return vma; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, + struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, + struct rb_node **rb_link, struct rb_node *rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static inline void __vma_link_file(struct vm_area_struct *vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&file->f_dentry->d_inode->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping->i_mmap_writable++; + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); + else + vma_prio_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } +} + +static void +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node **rb_link, + struct rb_node *rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + __anon_vma_link(vma); +} + +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node **rb_link, + struct rb_node *rb_parent) +{ + struct address_space *mapping = NULL; + + if (vma->vm_file) + mapping = vma->vm_file->f_mapping; + + if (mapping) { + spin_lock(&mapping->i_mmap_lock); + vma->vm_truncate_count = mapping->truncate_count; + } + anon_vma_lock(vma); + + __vma_link(mm, vma, prev, rb_link, rb_parent); + __vma_link_file(vma); + + anon_vma_unlock(vma); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + + mm->map_count++; + validate_mm(mm); +} + +/* + * Helper for vma_adjust in the split_vma insert case: + * insert vm structure into list and rbtree and anon_vma, + * but it has already been inserted into prio_tree earlier. + */ +static void +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; +} + +static inline void +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +/* + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that + * is already present in an i_mmap tree without adjusting the tree. + * The following helper function should be used when such adjustments + * are necessary. The "insert" vma (if any) is to be inserted + * before we drop the necessary locks. + */ +void vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *importer = NULL; + struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; + struct file *file = vma->vm_file; + struct anon_vma *anon_vma = NULL; + long adjust_next = 0; + int remove_next = 0; + + if (next && !insert) { + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and + * perhaps the one after too (mprotect case 6). + */ +again: remove_next = 1 + (end > next->vm_end); + end = next->vm_end; + anon_vma = next->anon_vma; + importer = vma; + } else if (end > next->vm_start) { + /* + * vma expands, overlapping part of the next: + * mprotect case 5 shifting the boundary up. + */ + adjust_next = (end - next->vm_start) >> PAGE_SHIFT; + anon_vma = next->anon_vma; + importer = vma; + } else if (end < vma->vm_end) { + /* + * vma shrinks, and !insert tells it's not + * split_vma inserting another: so it must be + * mprotect case 4 shifting the boundary down. + */ + adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); + anon_vma = next->anon_vma; + importer = next; + } + } + + if (file) { + mapping = file->f_mapping; + if (!(vma->vm_flags & VM_NONLINEAR)) + root = &mapping->i_mmap; + spin_lock(&mapping->i_mmap_lock); + if (importer && + vma->vm_truncate_count != next->vm_truncate_count) { + /* + * unmap_mapping_range might be in progress: + * ensure that the expanding vma is rescanned. + */ + importer->vm_truncate_count = 0; + } + if (insert) { + insert->vm_truncate_count = vma->vm_truncate_count; + /* + * Put into prio_tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(insert); + } + } + + /* + * When changing only vma->vm_end, we don't really need + * anon_vma lock: but is that case worth optimizing out? + */ + if (vma->anon_vma) + anon_vma = vma->anon_vma; + if (anon_vma) { + spin_lock(&anon_vma->lock); + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. + */ + if (importer && !importer->anon_vma) { + importer->anon_vma = anon_vma; + __anon_vma_link(importer); + } + } + + if (root) { + flush_dcache_mmap_lock(mapping); + vma_prio_tree_remove(vma, root); + if (adjust_next) + vma_prio_tree_remove(next, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; + } + + if (root) { + if (adjust_next) + vma_prio_tree_insert(next, root); + vma_prio_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + if (remove_next) { + /* + * vma_merge has merged next into vma, and needs + * us to remove next before dropping the locks. + */ + __vma_unlink(mm, next, vma); + if (file) + __remove_shared_vm_struct(next, file, mapping); + if (next->anon_vma) + __anon_vma_merge(vma, next); + } else if (insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + __insert_vm_struct(mm, insert); + } + + if (anon_vma) + spin_unlock(&anon_vma->lock); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + + if (remove_next) { + if (file) + fput(file); + mm->map_count--; + mpol_free(vma_policy(next)); + kmem_cache_free(vm_area_cachep, next); + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove another next too. It would clutter + * up the code too much to do both in one go. + */ + if (remove_next == 2) { + next = vma->vm_next; + goto again; + } + } + + validate_mm(mm); +} + +/* + * If the vma has a ->close operation then the driver probably needs to release + * per-vma resources, so we don't attempt to merge those. + */ +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) + +static inline int is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, unsigned long vm_flags) +{ + if (vma->vm_flags != vm_flags) + return 0; + if (vma->vm_file != file) + return 0; + if (vma->vm_ops && vma->vm_ops->close) + return 0; + return 1; +} + +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2) +{ + return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * in front of (at a lower virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We don't check here for the merged mmap wrapping around the end of pagecache + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which + * wrap, nor mmaps which cover the final page at index -1UL. + */ +static int +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) +{ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + if (vma->vm_pgoff == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * beyond (at a higher virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + */ +static int +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) +{ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + pgoff_t vm_pglen; + vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (vma->vm_pgoff + vm_pglen == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out + * whether that can be merged with its predecessor or its successor. + * Or both (it neatly fills a hole). + * + * In most cases - when called for mmap, brk or mremap - [addr,end) is + * certain not to be mapped by the time vma_merge is called; but when + * called for mprotect, it is certain to be already mapped (either at + * an offset within prev, or at the start of next), and the flags of + * this area are about to be changed to vm_flags - and the no-change + * case has already been eliminated. + * + * The following mprotect cases have to be considered, where AAAA is + * the area passed down from mprotect_fixup, never extending beyond one + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: + * + * AAAA AAAA AAAA AAAA + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX + * cannot merge might become might become might become + * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or + * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or + * mremap move: PPPPNNNNNNNN 8 + * AAAA + * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN + * might become case 1 below case 2 below case 3 below + * + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: + * mprotect_fixup updates vm_flags & vm_page_prot on successful return. + */ +struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy) +{ + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + struct vm_area_struct *area, *next; + + /* + * We later require that vma->vm_flags == vm_flags, + * so this tests vma->vm_flags & VM_SPECIAL, too. + */ + if (vm_flags & VM_SPECIAL) + return NULL; + + if (prev) + next = prev->vm_next; + else + next = mm->mmap; + area = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = next->vm_next; + + /* + * Can it merge with the predecessor? + */ + if (prev && prev->vm_end == addr && + mpol_equal(vma_policy(prev), policy) && + can_vma_merge_after(prev, vm_flags, + anon_vma, file, pgoff)) { + /* + * OK, it can. Can we now merge in the successor as well? + */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen) && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma)) { + /* cases 1, 6 */ + vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL); + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); + return prev; + } + + /* + * Can this new request be merged in front of next? + */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen)) { + if (prev && addr < prev->vm_end) /* case 4 */ + vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL); + else /* cases 3, 8 */ + vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL); + return area; + } + + return NULL; +} + +/* + * find_mergeable_anon_vma is used by anon_vma_prepare, to check + * neighbouring vmas for a suitable anon_vma, before it goes off + * to allocate a new anon_vma. It checks because a repetitive + * sequence of mprotects and faults may otherwise lead to distinct + * anon_vmas being allocated, preventing vma merge in subsequent + * mprotect. + */ +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *near; + unsigned long vm_flags; + + near = vma->vm_next; + if (!near) + goto try_prev; + + /* + * Since only mprotect tries to remerge vmas, match flags + * which might be mprotected into each other later on. + * Neither mlock nor madvise tries to remerge at present, + * so leave their flags as obstructing a merge. + */ + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); + + if (near->anon_vma && vma->vm_end == near->vm_start && + mpol_equal(vma_policy(vma), vma_policy(near)) && + can_vma_merge_before(near, vm_flags, + NULL, vma->vm_file, vma->vm_pgoff + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) + return near->anon_vma; +try_prev: + /* + * It is potentially slow to have to call find_vma_prev here. + * But it's only on the first write fault on the vma, not + * every time, and we could devise a way to avoid it later + * (e.g. stash info in next's anon_vma_node when assigning + * an anon_vma, or when trying vma_merge). Another time. + */ + if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma) + BUG(); + if (!near) + goto none; + + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); + + if (near->anon_vma && near->vm_end == vma->vm_start && + mpol_equal(vma_policy(near), vma_policy(vma)) && + can_vma_merge_after(near, vm_flags, + NULL, vma->vm_file, vma->vm_pgoff)) + return near->anon_vma; +none: + /* + * There's no absolute need to look only at touching neighbours: + * we could search further afield for "compatible" anon_vmas. + * But it would probably just be a waste of time searching, + * or lead to too many vmas hanging off the same anon_vma. + * We're trying to allow mprotect remerging later on, + * not trying to minimize memory used for anon_vmas. + */ + return NULL; +} + +#ifdef CONFIG_PROC_FS +void __vm_stat_account(struct mm_struct *mm, unsigned long flags, + struct file *file, long pages) +{ + const unsigned long stack_flags + = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); + +#ifdef CONFIG_HUGETLB + if (flags & VM_HUGETLB) { + if (!(flags & VM_DONTCOPY)) + mm->shared_vm += pages; + return; + } +#endif /* CONFIG_HUGETLB */ + + if (file) { + mm->shared_vm += pages; + if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) + mm->exec_vm += pages; + } else if (flags & stack_flags) + mm->stack_vm += pages; + if (flags & (VM_RESERVED|VM_IO)) + mm->reserved_vm += pages; +} +#endif /* CONFIG_PROC_FS */ + +/* + * The caller must hold down_write(current->mm->mmap_sem). + */ + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + struct inode *inode; + unsigned int vm_flags; + int correct_wcount = 0; + int error; + struct rb_node ** rb_link, * rb_parent; + int accountable = 1; + unsigned long charged = 0, reqprot = prot; + + if (file) { + if (is_file_hugepages(file)) + accountable = 0; + + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + + if ((prot & PROT_EXEC) && + (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) + return -EPERM; + } + /* + * Does the application expect PROT_READ to imply PROT_EXEC? + * + * (the exception is when the underlying filesystem is noexec + * mounted, in which case we dont add PROT_EXEC.) + */ + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) + if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))) + prot |= PROT_EXEC; + + if (!len) + return -EINVAL; + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len || len > TASK_SIZE) + return -ENOMEM; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EOVERFLOW; + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + if (flags & MAP_LOCKED) { + if (!can_do_mlock()) + return -EPERM; + vm_flags |= VM_LOCKED; + } + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + locked += len; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + + inode = file ? file->f_dentry->d_inode : NULL; + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) + return -EACCES; + + /* + * Make sure we don't allow writing to an append-only + * file.. + */ + if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* + * Make sure there are no mandatory locks on the file. + */ + if (locks_verify_locked(inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + break; + + default: + return -EINVAL; + } + } else { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + vm_flags |= VM_SHARED | VM_MAYSHARE; + break; + case MAP_PRIVATE: + /* + * Set pgoff according to addr for anon_vma. + */ + pgoff = addr >> PAGE_SHIFT; + break; + default: + return -EINVAL; + } + } + + error = security_file_mmap(file, reqprot, prot, flags); + if (error) + return error; + + /* Clear old maps */ + error = -ENOMEM; +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->signal->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (accountable && (!(flags & MAP_NORESERVE) || + sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { + if (vm_flags & VM_SHARED) { + /* Check memory availability in shmem_file_setup? */ + vm_flags |= VM_ACCOUNT; + } else if (vm_flags & VM_WRITE) { + /* + * Private writable mapping: check memory availability + */ + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + } + + /* + * Can we just expand an old private anonymous mapping? + * The VM_SHARED test is necessary because shmem_zero_setup + * will create the file object for a shared anonymous map below. + */ + if (!file && !(vm_flags & VM_SHARED) && + vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL)) + goto out; + + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + memset(vma, 0, sizeof(*vma)); + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_pgoff = pgoff; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform + * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) + * that memory reservation must be checked; but that reservation + * belongs to shared memory object, not to vma: so now clear it. + */ + if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) + vma->vm_flags &= ~VM_ACCOUNT; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + pgoff = vma->vm_pgoff; + vm_flags = vma->vm_flags; + + if (!file || !vma_merge(mm, prev, addr, vma->vm_end, + vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { + file = vma->vm_file; + vma_link(mm, vma, prev, rb_link, rb_parent); + if (correct_wcount) + atomic_inc(&inode->i_writecount); + } else { + if (file) { + if (correct_wcount) + atomic_inc(&inode->i_writecount); + fput(file); + } + mpol_free(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); + } +out: + mm->total_vm += len >> PAGE_SHIFT; + __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + if (flags & MAP_POPULATE) { + up_write(&mm->mmap_sem); + sys_remap_file_pages(addr, len, 0, + pgoff, flags & MAP_NONBLOCK); + down_write(&mm->mmap_sem); + } + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); +free_vma: + kmem_cache_free(vm_area_cachep, vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + return error; +} + +EXPORT_SYMBOL(do_mmap_pgoff); + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + start_addr = addr = mm->free_area_cache; + +full_search: + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + /* + * Remember the place where we stopped the search: + */ + mm->free_area_cache = addr + len; + return addr; + } + addr = vma->vm_end; + } +} +#endif + +void arch_unmap_area(struct vm_area_struct *area) +{ + /* + * Is this a new hole at the lowest possible address? + */ + if (area->vm_start >= TASK_UNMAPPED_BASE && + area->vm_start < area->vm_mm->free_area_cache) + area->vm_mm->free_area_cache = area->vm_start; +} + +/* + * This mmap-allocator allocates new areas top-down from below the + * stack's low limit (the base): + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (addr >= len) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + + addr = mm->mmap_base-len; + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (!vma || addr+len <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + } while (len <= vma->vm_start); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + + return addr; +} +#endif + +void arch_unmap_area_topdown(struct vm_area_struct *area) +{ + /* + * Is this a new hole at the highest possible address? + */ + if (area->vm_end > area->vm_mm->free_area_cache) + area->vm_mm->free_area_cache = area->vm_end; + + /* dont allow allocations above current base */ + if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) + area->vm_mm->free_area_cache = area->vm_mm->mmap_base; +} + +unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + if (flags & MAP_FIXED) { + unsigned long ret; + + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (file && is_file_hugepages(file)) { + /* + * Check if the given range is hugepage aligned, and + * can be made suitable for hugepages. + */ + ret = prepare_hugepage_range(addr, len); + } else { + /* + * Ensure that a normal request is not falling in a + * reserved hugepage range. For some archs like IA-64, + * there is a separate region for hugepages. + */ + ret = is_hugepage_only_range(current->mm, addr, len); + } + if (ret) + return -EINVAL; + return addr; + } + + if (file && file->f_op && file->f_op->get_unmapped_area) + return file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + +EXPORT_SYMBOL(get_unmapped_area); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + struct rb_node * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, + struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +EXPORT_SYMBOL(find_vma); + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma = NULL, *prev = NULL; + struct rb_node * rb_node; + if (!mm) + goto out; + + /* Guard against addr being lower than the first VMA */ + vma = mm->mmap; + + /* Go through the RB tree quickly. */ + rb_node = mm->mm_rb.rb_node; + + while (rb_node) { + struct vm_area_struct *vma_tmp; + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (addr < vma_tmp->vm_end) { + rb_node = rb_node->rb_left; + } else { + prev = vma_tmp; + if (!prev->vm_next || (addr < prev->vm_next->vm_end)) + break; + rb_node = rb_node->rb_right; + } + } + +out: + *pprev = prev; + return prev ? prev->vm_next : vma; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + struct rlimit *rlim = current->signal->rlim; + + /* address space limit tests */ + if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlim[RLIMIT_STACK].rlim_cur) + return -ENOMEM; + + /* mlock limit tests */ + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked; + unsigned long limit; + locked = mm->locked_vm + grow; + limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + if (locked > limit && !capable(CAP_IPC_LOCK)) + return -ENOMEM; + } + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory(grow)) + return -ENOMEM; + + /* Ok, everything looks good - let it rip */ + mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + return 0; +} + +#ifdef CONFIG_STACK_GROWSUP +/* + * vma is the first one with address > vma->vm_end. Have to extend vma. + */ +int expand_stack(struct vm_area_struct * vma, unsigned long address) +{ + int error; + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + anon_vma_lock(vma); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + address += 4 + PAGE_SIZE - 1; + address &= PAGE_MASK; + error = 0; + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = acct_stack_growth(vma, size, grow); + if (!error) + vma->vm_end = address; + } + anon_vma_unlock(vma); + return error; +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) { + make_pages_present(addr, prev->vm_end); + } + return prev; +} +#else +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + int error; + + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + anon_vma_lock(vma); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + address &= PAGE_MASK; + error = 0; + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_start = address; + vma->vm_pgoff -= grow; + } + } + anon_vma_unlock(vma); + return error; +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} +#endif + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. + */ +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + struct mm_struct *mm = tlb->mm; + + if (last > MM_VM_SIZE(mm) || last < end) + last = MM_VM_SIZE(mm); + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end; + break; + } +no_mmaps: + if (last < first) /* for arches with discontiguous pgd indices */ + return; + if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) + first = FIRST_USER_PGD_NR * PGDIR_SIZE; + /* No point trying to free anything if we're in the same pte page */ + if ((first & PMD_MASK) < (last & PMD_MASK)) { + clear_page_range(tlb, first, last); + flush_tlb_pgtables(mm, first, last); + } +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list. + */ +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) +{ + size_t len = area->vm_end - area->vm_start; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + vm_stat_unaccount(area); + area->vm_mm->unmap_area(area); + remove_vm_struct(area); +} + +/* + * Update the VMA and inode share lists. + * + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and do the vma updates. + */ +static void unmap_vma_list(struct mm_struct *mm, + struct vm_area_struct *mpnt) +{ + do { + struct vm_area_struct *next = mpnt->vm_next; + unmap_vma(mm, mpnt); + mpnt = next; + } while (mpnt != NULL); + validate_mm(mm); +} + +/* + * Get rid of page table information in the indicated region. + * + * Called with the page table lock held. + */ +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev, + unsigned long start, + unsigned long end) +{ + struct mmu_gather *tlb; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + + if (is_hugepage_only_range(mm, start, end - start)) + hugetlb_free_pgtables(tlb, prev, start, end); + else + free_pgtables(tlb, prev, start, end); + tlb_finish_mmu(tlb, start, end); +} + +/* + * Create a list of vma's touched by the unmap, removing them from the mm's + * vma list as we go.. + */ +static void +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, unsigned long end) +{ + struct vm_area_struct **insertion_point; + struct vm_area_struct *tail_vma = NULL; + + insertion_point = (prev ? &prev->vm_next : &mm->mmap); + do { + rb_erase(&vma->vm_rb, &mm->mm_rb); + mm->map_count--; + tail_vma = vma; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + *insertion_point = vma; + tail_vma->vm_next = NULL; + mm->mmap_cache = NULL; /* Kill the cache. */ +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the the tail. + */ +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long addr, int new_below) +{ + struct mempolicy *pol; + struct vm_area_struct *new; + + if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) + return -EINVAL; + + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new) + return -ENOMEM; + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + + if (new_below) + new->vm_end = addr; + else { + new->vm_start = addr; + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + + pol = mpol_copy(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new); + return PTR_ERR(pol); + } + vma_set_policy(new, pol); + + if (new->vm_file) + get_file(new->vm_file); + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + if (new_below) + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); + else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardinge + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end; + struct vm_area_struct *mpnt, *prev, *last; + + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Find the first overlapping VMA */ + mpnt = find_vma_prev(mm, start, &prev); + if (!mpnt) + return 0; + /* we have start < mpnt->vm_end */ + + /* if it doesn't overlap, we have nothing.. */ + end = start + len; + if (mpnt->vm_start >= end) + return 0; + + /* + * If we need to split any vma, do it now to save pain later. + * + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially + * unmapped vm_area_struct will remain in use: so lower split_vma + * places tmp vma above, and higher split_vma places tmp vma below. + */ + if (start > mpnt->vm_start) { + int error = split_vma(mm, mpnt, start, 0); + if (error) + return error; + prev = mpnt; + } + + /* Does it split the last one? */ + last = find_vma(mm, end); + if (last && end > last->vm_start) { + int error = split_vma(mm, last, end, 1); + if (error) + return error; + } + mpnt = prev? prev->vm_next: mm->mmap; + + /* + * Remove the vma's, and unmap the actual pages + */ + detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + spin_lock(&mm->page_table_lock); + unmap_region(mm, mpnt, prev, start, end); + spin_unlock(&mm->page_table_lock); + + /* Fix up all other VM information */ + unmap_vma_list(mm, mpnt); + + return 0; +} + +EXPORT_SYMBOL(do_munmap); + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + profile_munmap(addr); + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +static inline void verify_mm_writelocked(struct mm_struct *mm) +{ +#ifdef CONFIG_DEBUG_KERNEL + if (unlikely(down_read_trylock(&mm->mmap_sem))) { + WARN_ON(1); + up_read(&mm->mmap_sem); + } +#endif +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + struct rb_node ** rb_link, * rb_parent; + pgoff_t pgoff = addr >> PAGE_SHIFT; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + if ((addr + len) > TASK_SIZE || (addr + len) < addr) + return -EINVAL; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + locked += len; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + + /* + * mm->mmap_sem is required to protect against another thread + * changing the mappings in case we sleep. + */ + verify_mm_writelocked(mm); + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->signal->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + /* Can we just expand an old private anonymous mapping? */ + if (vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + } + memset(vma, 0, sizeof(*vma)); + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = pgoff; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma_link(mm, vma, prev, rb_link, rb_parent); +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +EXPORT_SYMBOL(do_brk); + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct *mm) +{ + struct mmu_gather *tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + + lru_add_drain(); + + spin_lock(&mm->page_table_lock); + + tlb = tlb_gather_mmu(mm, 1); + flush_cache_mm(mm); + /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ + mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, + ~0UL, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + BUG_ON(mm->map_count); /* This is just debugging */ + clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); + + tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); + + vma = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + set_mm_counter(mm, rss, 0); + mm->total_vm = 0; + mm->locked_vm = 0; + + spin_unlock(&mm->page_table_lock); + + /* + * Walk the list again, actually closing and freeing it + * without holding any MM locks. + */ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + remove_vm_struct(vma); + vma = next; + } +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_lock is taken here. + */ +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap_pgoff and in do_brk. + */ + if (!vma->vm_file) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + return -ENOMEM; + vma_link(mm, vma, prev, rb_link, rb_parent); + return 0; +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff) +{ + struct vm_area_struct *vma = *vmap; + unsigned long vma_start = vma->vm_start; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma, *prev; + struct rb_node **rb_link, *rb_parent; + struct mempolicy *pol; + + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. + */ + if (!vma->vm_file && !vma->anon_vma) + pgoff = addr >> PAGE_SHIFT; + + find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (new_vma) { + /* + * Source vma may have been merged into new_vma + */ + if (vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end) + *vmap = new_vma; + } else { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (new_vma) { + *new_vma = *vma; + pol = mpol_copy(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; + } + vma_set_policy(new_vma, pol); + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + } + } + return new_vma; +} diff --git a/mm/mprotect.c b/mm/mprotect.c new file mode 100644 index 000000000000..e9fbd013ad9a --- /dev/null +++ b/mm/mprotect.c @@ -0,0 +1,282 @@ +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot) +{ + pte_t *pte; + + pte = pte_offset_map(pmd, addr); + do { + if (pte_present(*pte)) { + pte_t ptent; + + /* Avoid an SMP race with hardware updated dirty/clean + * bits by wiping the pte and then setting the new pte + * into place. + */ + ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); + set_pte_at(mm, addr, pte, ptent); + lazy_mmu_prot_update(ptent); + } + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); +} + +static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t newprot) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + change_pte_range(mm, pmd, addr, next, newprot); + } while (pmd++, addr = next, addr != end); +} + +static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t newprot) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + change_pmd_range(mm, pud, addr, next, newprot); + } while (pud++, addr = next, addr != end); +} + +static void change_protection(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long start = addr; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + change_pud_range(mm, pgd, addr, next, newprot); + } while (pgd++, addr = next, addr != end); + flush_tlb_range(vma, start, end); + spin_unlock(&mm->page_table_lock); +} + +static int +mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; + unsigned long charged = 0; + pgprot_t newprot; + pgoff_t pgoff; + int error; + + if (newflags == oldflags) { + *pprev = vma; + return 0; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. + * + * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting + * a MAP_NORESERVE private mapping to writable will now reserve. + */ + if (newflags & VM_WRITE) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + charged = nrpages; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + newprot = protection_map[newflags & 0xf]; + + /* + * First try to merge with previous and/or next vma. + */ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *pprev = vma_merge(mm, *pprev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (*pprev) { + vma = *pprev; + goto success; + } + + *pprev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + +success: + /* + * vm_flags and vm_page_prot are protected by the mmap_sem + * held in write mode. + */ + vma->vm_flags = newflags; + vma->vm_page_prot = newprot; + change_protection(vma, start, end, newprot); + __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + __vm_stat_account(mm, newflags, vma->vm_file, nrpages); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +asmlinkage long +sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long vm_flags, nstart, end, tmp, reqprot; + struct vm_area_struct *vma, *prev; + int error = -EINVAL; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (!len) + return 0; + len = PAGE_ALIGN(len); + end = start + len; + if (end <= start) + return -ENOMEM; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) + return -EINVAL; + + reqprot = prot; + /* + * Does the application expect PROT_READ to imply PROT_EXEC: + */ + if (unlikely((prot & PROT_READ) && + (current->personality & READ_IMPLIES_EXEC))) + prot |= PROT_EXEC; + + vm_flags = calc_vm_prot_bits(prot); + + down_write(¤t->mm->mmap_sem); + + vma = find_vma_prev(current->mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } + else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + unsigned long newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + if (is_vm_hugetlb_page(vma)) { + error = -EACCES; + goto out; + } + + newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + goto out; + } + + error = security_file_mprotect(vma, reqprot, prot); + if (error) + goto out; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + nstart = tmp; + + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + goto out; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } +out: + up_write(¤t->mm->mmap_sem); + return error; +} diff --git a/mm/mremap.c b/mm/mremap.c new file mode 100644 index 000000000000..0d1c1b9c7a0a --- /dev/null +++ b/mm/mremap.c @@ -0,0 +1,426 @@ +/* + * mm/mremap.c + * + * (C) Copyright 1996 Linus Torvalds + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + goto end; + + pud = pud_offset(pgd, addr); + if (pud_none_or_clear_bad(pud)) + goto end; + + pmd = pmd_offset(pud, addr); + if (pmd_none_or_clear_bad(pmd)) + goto end; + + pte = pte_offset_map_nested(pmd, addr); + if (pte_none(*pte)) { + pte_unmap_nested(pte); + pte = NULL; + } +end: + return pte; +} + +static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + pud = pud_offset(pgd, addr); + if (pud_none_or_clear_bad(pud)) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + + return pte_offset_map(pmd, addr); +} + +static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); + return pte; +} + +static int +move_one_page(struct vm_area_struct *vma, unsigned long old_addr, + struct vm_area_struct *new_vma, unsigned long new_addr) +{ + struct address_space *mapping = NULL; + struct mm_struct *mm = vma->vm_mm; + int error = 0; + pte_t *src, *dst; + + if (vma->vm_file) { + /* + * Subtle point from Rajesh Venkatasubramanian: before + * moving file-based ptes, we must lock vmtruncate out, + * since it might clean the dst vma before the src vma, + * and we propagate stale pages into the dst afterward. + */ + mapping = vma->vm_file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + if (new_vma->vm_truncate_count && + new_vma->vm_truncate_count != vma->vm_truncate_count) + new_vma->vm_truncate_count = 0; + } + spin_lock(&mm->page_table_lock); + + src = get_one_pte_map_nested(mm, old_addr); + if (src) { + /* + * Look to see whether alloc_one_pte_map needs to perform a + * memory allocation. If it does then we need to drop the + * atomic kmap + */ + dst = get_one_pte_map(mm, new_addr); + if (unlikely(!dst)) { + pte_unmap_nested(src); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + dst = alloc_one_pte_map(mm, new_addr); + if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { + spin_unlock(&mm->page_table_lock); + spin_lock(&mapping->i_mmap_lock); + spin_lock(&mm->page_table_lock); + } + src = get_one_pte_map_nested(mm, old_addr); + } + /* + * Since alloc_one_pte_map can drop and re-acquire + * page_table_lock, we should re-check the src entry... + */ + if (src) { + if (dst) { + pte_t pte; + pte = ptep_clear_flush(vma, old_addr, src); + set_pte_at(mm, new_addr, dst, pte); + } else + error = -ENOMEM; + pte_unmap_nested(src); + } + if (dst) + pte_unmap(dst); + } + spin_unlock(&mm->page_table_lock); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + return error; +} + +static unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len) +{ + unsigned long offset; + + flush_cache_range(vma, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + for (offset = 0; offset < len; offset += PAGE_SIZE) { + if (move_one_page(vma, old_addr + offset, + new_vma, new_addr + offset) < 0) + break; + cond_resched(); + } + return offset; +} + +static unsigned long move_vma(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long old_len, + unsigned long new_len, unsigned long new_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma; + unsigned long vm_flags = vma->vm_flags; + unsigned long new_pgoff; + unsigned long moved_len; + unsigned long excess = 0; + int split = 0; + + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. + */ + if (mm->map_count >= sysctl_max_map_count - 3) + return -ENOMEM; + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + if (!new_vma) + return -ENOMEM; + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + if (moved_len < old_len) { + /* + * On error, move entries back from new area to old, + * which will succeed since page tables still there, + * and then proceed to unmap new area instead of old. + */ + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + vma = new_vma; + old_len = new_len; + old_addr = new_addr; + new_addr = -ENOMEM; + } + + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vm_flags & VM_ACCOUNT) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (old_addr > vma->vm_start && + old_addr + old_len < vma->vm_end) + split = 1; + } + + if (do_munmap(mm, old_addr, old_len) < 0) { + /* OOM: unable to split vma, just get accounts right */ + vm_unacct_memory(excess >> PAGE_SHIFT); + excess = 0; + } + + /* Restore VM_ACCOUNT if one or two pieces of vma left */ + if (excess) { + vma->vm_flags |= VM_ACCOUNT; + if (split) + vma->vm_next->vm_flags |= VM_ACCOUNT; + } + + mm->total_vm += new_len >> PAGE_SHIFT; + __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + mm->locked_vm += new_len >> PAGE_SHIFT; + if (new_len > old_len) + make_pages_present(new_addr + old_len, + new_addr + new_len); + } + + return new_addr; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. + */ + if (!new_len) + goto out; + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = do_munmap(current->mm, new_addr, new_len); + if (ret) + goto out; + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + * do_munmap does all the needed commit accounting + */ + if (old_len >= new_len) { + ret = do_munmap(current->mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + ret = addr; + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + old_len = new_len; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (is_vm_hugetlb_page(vma)) { + ret = -EINVAL; + goto out; + } + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = current->mm->locked_vm << PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + goto out; + } + ret = -ENOMEM; + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->signal->rlim[RLIMIT_AS].rlim_cur) + goto out; + + if (vma->vm_flags & VM_ACCOUNT) { + charged = (new_len - old_len) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + goto out_nc; + } + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + + vma_adjust(vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL); + + current->mm->total_vm += pages; + __vm_stat_account(vma->vm_mm, vma->vm_flags, + vma->vm_file, pages); + if (vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. + */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + if (!(flags & MREMAP_FIXED)) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff, map_flags); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; + } + ret = move_vma(vma, addr, old_len, new_len, new_addr); + } +out: + if (ret & ~PAGE_MASK) + vm_unacct_memory(charged); +out_nc: + return ret; +} + +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} diff --git a/mm/msync.c b/mm/msync.c new file mode 100644 index 000000000000..090f426bca7d --- /dev/null +++ b/mm/msync.c @@ -0,0 +1,236 @@ +/* + * linux/mm/msync.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * The msync() system call. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. + */ + +static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end) +{ + pte_t *pte; + + pte = pte_offset_map(pmd, addr); + do { + unsigned long pfn; + struct page *page; + + if (!pte_present(*pte)) + continue; + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + + if (ptep_clear_flush_dirty(vma, addr, pte) || + page_test_and_clear_dirty(page)) + set_page_dirty(page); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); +} + +static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + sync_pte_range(vma, pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + sync_pmd_range(vma, pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void sync_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + + /* For hugepages we can't go walking the page table normally, + * but that's ok, hugetlbfs is memory based, so we don't need + * to do anything more on an msync() */ + if (is_vm_hugetlb_page(vma)) + return; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + sync_pud_range(vma, pgd, addr, next); + } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); +} + +#ifdef CONFIG_PREEMPT +static inline void filemap_sync(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + const size_t chunk = 64 * 1024; /* bytes */ + unsigned long next; + + do { + next = addr + chunk; + if (next > end || next < addr) + next = end; + sync_page_range(vma, addr, next); + cond_resched(); + } while (addr = next, addr != end); +} +#else +static inline void filemap_sync(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + sync_page_range(vma, addr, end); +} +#endif + +/* + * MS_SYNC syncs the entire file - including mappings. + * + * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just + * marks the relevant pages dirty. The application may now run fsync() to + * write out the dirty pages and wait on the writeout and check the result. + * Or the application may run fadvise(FADV_DONTNEED) against the fd to start + * async writeout immediately. + * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * applications. + */ +static int msync_interval(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, int flags) +{ + int ret = 0; + struct file *file = vma->vm_file; + + if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) + return -EBUSY; + + if (file && (vma->vm_flags & VM_SHARED)) { + filemap_sync(vma, addr, end); + + if (flags & MS_SYNC) { + struct address_space *mapping = file->f_mapping; + int err; + + ret = filemap_fdatawrite(mapping); + if (file->f_op && file->f_op->fsync) { + /* + * We don't take i_sem here because mmap_sem + * is already held. + */ + err = file->f_op->fsync(file,file->f_dentry,1); + if (err && !ret) + ret = err; + } + err = filemap_fdatawait(mapping); + if (!ret) + ret = err; + } + } + return ret; +} + +asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +{ + unsigned long end; + struct vm_area_struct *vma; + int unmapped_error, error = -EINVAL; + + if (flags & MS_SYNC) + current->flags |= PF_SYNCWRITE; + + down_read(¤t->mm->mmap_sem); + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + goto out; + if (start & ~PAGE_MASK) + goto out; + if ((flags & MS_ASYNC) && (flags & MS_SYNC)) + goto out; + error = -ENOMEM; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + error = 0; + if (end == start) + goto out; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -ENOMEM at the end. + */ + vma = find_vma(current->mm, start); + unmapped_error = 0; + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if (end <= vma->vm_end) { + if (start < end) { + error = msync_interval(vma, start, end, flags); + if (error) + goto out; + } + error = unmapped_error; + goto out; + } + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = msync_interval(vma, start, vma->vm_end, flags); + if (error) + goto out; + start = vma->vm_end; + vma = vma->vm_next; + } +out: + up_read(¤t->mm->mmap_sem); + current->flags &= ~PF_SYNCWRITE; + return error; +} diff --git a/mm/nommu.c b/mm/nommu.c new file mode 100644 index 000000000000..b293ec1cc4e6 --- /dev/null +++ b/mm/nommu.c @@ -0,0 +1,1180 @@ +/* + * linux/mm/nommu.c + * + * Replacement code for mm functions to support CPU's that don't + * have any form of memory management unit (thus no virtual memory). + * + * See Documentation/nommu-mmap.txt + * + * Copyright (c) 2004-2005 David Howells + * Copyright (c) 2000-2003 David McCullough + * Copyright (c) 2000-2001 D Jeff Dionne + * Copyright (c) 2002 Greg Ungerer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void *high_memory; +struct page *mem_map; +unsigned long max_mapnr; +unsigned long num_physpages; +unsigned long askedalloc, realalloc; +atomic_t vm_committed_space = ATOMIC_INIT(0); +int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int heap_stack_gap = 0; + +EXPORT_SYMBOL(mem_map); +EXPORT_SYMBOL(sysctl_max_map_count); +EXPORT_SYMBOL(sysctl_overcommit_memory); +EXPORT_SYMBOL(sysctl_overcommit_ratio); +EXPORT_SYMBOL(vm_committed_space); +EXPORT_SYMBOL(__vm_enough_memory); + +/* list of shareable VMAs */ +struct rb_root nommu_vma_tree = RB_ROOT; +DECLARE_RWSEM(nommu_vma_sem); + +struct vm_operations_struct generic_file_vm_ops = { +}; + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + i_size_write(inode, offset); + + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out; + i_size_write(inode, offset); + +out_truncate: + if (inode->i_op && inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out: + return -EFBIG; +} + +EXPORT_SYMBOL(vmtruncate); + +/* + * Return the total memory allocated for this pointer, not + * just what the caller asked for. + * + * Doesn't have to be accurate, i.e. may have races. + */ +unsigned int kobjsize(const void *objp) +{ + struct page *page; + + if (!objp || !((page = virt_to_page(objp)))) + return 0; + + if (PageSlab(page)) + return ksize(objp); + + BUG_ON(page->index < 0); + BUG_ON(page->index >= MAX_ORDER); + + return (PAGE_SIZE << page->index); +} + +/* + * The nommu dodgy version :-) + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int i; + static struct vm_area_struct dummy_vma; + + for (i = 0; i < len; i++) { + if (pages) { + pages[i] = virt_to_page(start); + if (pages[i]) + page_cache_get(pages[i]); + } + if (vmas) + vmas[i] = &dummy_vma; + start += PAGE_SIZE; + } + return(i); +} + +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +void vfree(void *addr) +{ + kfree(addr); +} + +void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot) +{ + /* + * kmalloc doesn't like __GFP_HIGHMEM for some reason + */ + return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); +} + +struct page * vmalloc_to_page(void *addr) +{ + return virt_to_page(addr); +} + +unsigned long vmalloc_to_pfn(void *addr) +{ + return page_to_pfn(virt_to_page(addr)); +} + + +long vread(char *buf, char *addr, unsigned long count) +{ + memcpy(buf, addr, count); + return count; +} + +long vwrite(char *buf, char *addr, unsigned long count) +{ + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + memcpy(addr, buf, count); + return(count); +} + +/* + * vmalloc - allocate virtually continguos memory + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * + * For tight cotrol over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +} + +/* + * vmalloc_32 - allocate virtually continguos memory (32bit addressable) + * + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into continguos kernel virtual space. + */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +} + +void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) +{ + BUG(); + return NULL; +} + +void vunmap(void *addr) +{ + BUG(); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + struct mm_struct *mm = current->mm; + + if (brk < mm->start_brk || brk > mm->context.end_brk) + return mm->brk; + + if (mm->brk == brk) + return mm->brk; + + /* + * Always allow shrinking brk + */ + if (brk <= mm->brk) { + mm->brk = brk; + return brk; + } + + /* + * Ok, looks good - let it rip. + */ + return mm->brk = brk; +} + +#ifdef DEBUG +static void show_process_blocks(void) +{ + struct vm_list_struct *vml; + + printk("Process blocks %d:", current->pid); + + for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { + printk(" %p: %p", vml, vml->vma); + if (vml->vma) + printk(" (%d @%lx #%d)", + kobjsize((void *) vml->vma->vm_start), + vml->vma->vm_start, + atomic_read(&vml->vma->vm_usage)); + printk(vml->next ? " ->" : ".\n"); + } +} +#endif /* DEBUG */ + +static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +{ + struct vm_area_struct *vma; + struct rb_node *n = nommu_vma_tree.rb_node; + + while (n) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + + if (start < vma->vm_start) + n = n->rb_left; + else if (start > vma->vm_start) + n = n->rb_right; + else + return vma; + } + + return NULL; +} + +static void add_nommu_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *pvma; + struct address_space *mapping; + struct rb_node **p = &nommu_vma_tree.rb_node; + struct rb_node *parent = NULL; + + /* add the VMA to the mapping */ + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + + flush_dcache_mmap_lock(mapping); + vma_prio_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } + + /* add the VMA to the master list */ + while (*p) { + parent = *p; + pvma = rb_entry(parent, struct vm_area_struct, vm_rb); + + if (vma->vm_start < pvma->vm_start) { + p = &(*p)->rb_left; + } + else if (vma->vm_start > pvma->vm_start) { + p = &(*p)->rb_right; + } + else { + /* mappings are at the same address - this can only + * happen for shared-mem chardevs and shared file + * mappings backed by ramfs/tmpfs */ + BUG_ON(!(pvma->vm_flags & VM_SHARED)); + + if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); + } + } + + rb_link_node(&vma->vm_rb, parent, p); + rb_insert_color(&vma->vm_rb, &nommu_vma_tree); +} + +static void delete_nommu_vma(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + /* remove the VMA from the mapping */ + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + + flush_dcache_mmap_lock(mapping); + vma_prio_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } + + /* remove from the master list */ + rb_erase(&vma->vm_rb, &nommu_vma_tree); +} + +/* + * determine whether a mapping should be permitted and, if so, what sort of + * mapping we're capable of supporting + */ +static int validate_mmap_request(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff, + unsigned long *_capabilities) +{ + unsigned long capabilities; + unsigned long reqprot = prot; + int ret; + + /* do the simple checks first */ + if (flags & MAP_FIXED || addr) { + printk(KERN_DEBUG + "%d: Can't do fixed-address/overlay mmap of RAM\n", + current->pid); + return -EINVAL; + } + + if ((flags & MAP_TYPE) != MAP_PRIVATE && + (flags & MAP_TYPE) != MAP_SHARED) + return -EINVAL; + + if (PAGE_ALIGN(len) == 0) + return addr; + + if (len > TASK_SIZE) + return -EINVAL; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EINVAL; + + if (file) { + /* validate file mapping requests */ + struct address_space *mapping; + + /* files must support mmap */ + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + + /* work out if what we've got could possibly be shared + * - we support chardevs that provide their own "memory" + * - we support files/blockdevs that are memory backed + */ + mapping = file->f_mapping; + if (!mapping) + mapping = file->f_dentry->d_inode->i_mapping; + + capabilities = 0; + if (mapping && mapping->backing_dev_info) + capabilities = mapping->backing_dev_info->capabilities; + + if (!capabilities) { + /* no explicit capabilities set, so assume some + * defaults */ + switch (file->f_dentry->d_inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFBLK: + capabilities = BDI_CAP_MAP_COPY; + break; + + case S_IFCHR: + capabilities = + BDI_CAP_MAP_DIRECT | + BDI_CAP_READ_MAP | + BDI_CAP_WRITE_MAP; + break; + + default: + return -EINVAL; + } + } + + /* eliminate any capabilities that we can't support on this + * device */ + if (!file->f_op->get_unmapped_area) + capabilities &= ~BDI_CAP_MAP_DIRECT; + if (!file->f_op->read) + capabilities &= ~BDI_CAP_MAP_COPY; + + if (flags & MAP_SHARED) { + /* do checks for writing, appending and locking */ + if ((prot & PROT_WRITE) && + !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (IS_APPEND(file->f_dentry->d_inode) && + (file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (locks_verify_locked(file->f_dentry->d_inode)) + return -EAGAIN; + + if (!(capabilities & BDI_CAP_MAP_DIRECT)) + return -ENODEV; + + if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || + ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || + ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + ) { + printk("MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + + /* we mustn't privatise shared mappings */ + capabilities &= ~BDI_CAP_MAP_COPY; + } + else { + /* we're going to read the file into private memory we + * allocate */ + if (!(capabilities & BDI_CAP_MAP_COPY)) + return -ENODEV; + + /* we don't permit a private writable mapping to be + * shared with the backing device */ + if (prot & PROT_WRITE) + capabilities &= ~BDI_CAP_MAP_DIRECT; + } + + /* handle executable mappings and implied executable + * mappings */ + if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { + if (prot & PROT_EXEC) + return -EPERM; + } + else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + /* handle implication of PROT_EXEC by PROT_READ */ + if (current->personality & READ_IMPLIES_EXEC) { + if (capabilities & BDI_CAP_EXEC_MAP) + prot |= PROT_EXEC; + } + } + else if ((prot & PROT_READ) && + (prot & PROT_EXEC) && + !(capabilities & BDI_CAP_EXEC_MAP) + ) { + /* backing file is not executable, try to copy */ + capabilities &= ~BDI_CAP_MAP_DIRECT; + } + } + else { + /* anonymous mappings are always memory backed and can be + * privately mapped + */ + capabilities = BDI_CAP_MAP_COPY; + + /* handle PROT_EXEC implication by PROT_READ */ + if ((prot & PROT_READ) && + (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + } + + /* allow the security API to have its say */ + ret = security_file_mmap(file, reqprot, prot, flags); + if (ret < 0) + return ret; + + /* looks okay */ + *_capabilities = capabilities; + return 0; +} + +/* + * we've determined that we can make the mapping, now translate what we + * now know into VMA flags + */ +static unsigned long determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) +{ + unsigned long vm_flags; + + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* vm_flags |= mm->def_flags; */ + + if (!(capabilities & BDI_CAP_MAP_DIRECT)) { + /* attempt to share read-only copies of mapped file chunks */ + if (file && !(prot & PROT_WRITE)) + vm_flags |= VM_MAYSHARE; + } + else { + /* overlay a shareable mapping on the backing device or inode + * if possible - used for chardevs, ramfs/tmpfs/shmfs and + * romfs/cramfs */ + if (flags & MAP_SHARED) + vm_flags |= VM_MAYSHARE | VM_SHARED; + else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) + vm_flags |= VM_MAYSHARE; + } + + /* refuse to let anyone share private mappings with this process if + * it's being traced - otherwise breakpoints set in it may interfere + * with another untraced process + */ + if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) + vm_flags &= ~VM_MAYSHARE; + + return vm_flags; +} + +/* + * set up a shared mapping on a file + */ +static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +{ + int ret; + + ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret != -ENOSYS) + return ret; + + /* getting an ENOSYS error indicates that direct mmap isn't + * possible (as opposed to tried but failed) so we'll fall + * through to making a private copy of the data and mapping + * that if we can */ + return -ENODEV; +} + +/* + * set up a private mapping or an anonymous shared mapping + */ +static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +{ + void *base; + int ret; + + /* invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory + * - VM_MAYSHARE will be set if it may attempt to share + */ + if (vma->vm_file) { + ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret != -ENOSYS) { + /* shouldn't return success if we're not sharing */ + BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); + return ret; /* success or a real error */ + } + + /* getting an ENOSYS error indicates that direct mmap isn't + * possible (as opposed to tried but failed) so we'll try to + * make a private copy of the data and map that instead */ + } + + /* allocate some memory to hold the mapping + * - note that this may not return a page-aligned address if the object + * we're allocating is smaller than a page + */ + base = kmalloc(len, GFP_KERNEL); + if (!base) + goto enomem; + + vma->vm_start = (unsigned long) base; + vma->vm_end = vma->vm_start + len; + vma->vm_flags |= VM_MAPPED_COPY; + +#ifdef WARN_ON_SLACK + if (len + WARN_ON_SLACK <= kobjsize(result)) + printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", + len, current->pid, kobjsize(result) - len); +#endif + + if (vma->vm_file) { + /* read the contents of a file into the copy */ + mm_segment_t old_fs; + loff_t fpos; + + fpos = vma->vm_pgoff; + fpos <<= PAGE_SHIFT; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + set_fs(old_fs); + + if (ret < 0) + goto error_free; + + /* clear the last little bit */ + if (ret < len) + memset(base + ret, 0, len - ret); + + } else { + /* if it's an anonymous mapping, then just clear it */ + memset(base, 0, len); + } + + return 0; + +error_free: + kfree(base); + vma->vm_start = 0; + return ret; + +enomem: + printk("Allocation of length %lu from process %d failed\n", + len, current->pid); + show_free_areas(); + return -ENOMEM; +} + +/* + * handle mapping creation for uClinux + */ +unsigned long do_mmap_pgoff(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff) +{ + struct vm_list_struct *vml = NULL; + struct vm_area_struct *vma = NULL; + struct rb_node *rb; + unsigned long capabilities, vm_flags; + void *result; + int ret; + + /* decide whether we should attempt the mapping, and if so what sort of + * mapping */ + ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, + &capabilities); + if (ret < 0) + return ret; + + /* we've determined that we can make the mapping, now translate what we + * now know into VMA flags */ + vm_flags = determine_vm_flags(file, prot, flags, capabilities); + + /* we're going to need to record the mapping if it works */ + vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); + if (!vml) + goto error_getting_vml; + memset(vml, 0, sizeof(*vml)); + + down_write(&nommu_vma_sem); + + /* if we want to share, we need to check for VMAs created by other + * mmap() calls that overlap with our proposed mapping + * - we can only share with an exact match on most regular files + * - shared mappings on character devices and memory backed files are + * permitted to overlap inexactly as far as we are concerned for in + * these cases, sharing is handled in the driver or filesystem rather + * than here + */ + if (vm_flags & VM_MAYSHARE) { + unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long vmpglen; + + for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + + if (!(vma->vm_flags & VM_MAYSHARE)) + continue; + + /* search for overlapping mappings on the same file */ + if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) + continue; + + if (vma->vm_pgoff >= pgoff + pglen) + continue; + + vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; + vmpglen >>= PAGE_SHIFT; + if (pgoff >= vma->vm_pgoff + vmpglen) + continue; + + /* handle inexactly overlapping matches between mappings */ + if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + if (!(capabilities & BDI_CAP_MAP_DIRECT)) + goto sharing_violation; + continue; + } + + /* we've found a VMA we can share */ + atomic_inc(&vma->vm_usage); + + vml->vma = vma; + result = (void *) vma->vm_start; + goto shared; + } + + vma = NULL; + + /* obtain the address at which to make a shared mapping + * - this is the hook for quasi-memory character devices to + * tell us the location of a shared mapping + */ + if (file && file->f_op->get_unmapped_area) { + addr = file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + if (IS_ERR((void *) addr)) { + ret = addr; + if (ret != (unsigned long) -ENOSYS) + goto error; + + /* the driver refused to tell us where to site + * the mapping so we'll have to attempt to copy + * it */ + ret = (unsigned long) -ENODEV; + if (!(capabilities & BDI_CAP_MAP_COPY)) + goto error; + + capabilities &= ~BDI_CAP_MAP_DIRECT; + } + } + } + + /* we're going to need a VMA struct as well */ + vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + memset(vma, 0, sizeof(*vma)); + INIT_LIST_HEAD(&vma->anon_vma_node); + atomic_set(&vma->vm_usage, 1); + if (file) + get_file(file); + vma->vm_file = file; + vma->vm_flags = vm_flags; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = pgoff; + + vml->vma = vma; + + /* set up the mapping */ + if (file && vma->vm_flags & VM_SHARED) + ret = do_mmap_shared_file(vma, len); + else + ret = do_mmap_private(vma, len); + if (ret < 0) + goto error; + + /* okay... we have a mapping; now we have to register it */ + result = (void *) vma->vm_start; + + if (vma->vm_flags & VM_MAPPED_COPY) { + realalloc += kobjsize(result); + askedalloc += len; + } + + realalloc += kobjsize(vma); + askedalloc += sizeof(*vma); + + current->mm->total_vm += len >> PAGE_SHIFT; + + add_nommu_vma(vma); + + shared: + realalloc += kobjsize(vml); + askedalloc += sizeof(*vml); + + vml->next = current->mm->context.vmlist; + current->mm->context.vmlist = vml; + + up_write(&nommu_vma_sem); + + if (prot & PROT_EXEC) + flush_icache_range((unsigned long) result, + (unsigned long) result + len); + +#ifdef DEBUG + printk("do_mmap:\n"); + show_process_blocks(); +#endif + + return (unsigned long) result; + + error: + up_write(&nommu_vma_sem); + kfree(vml); + if (vma) { + fput(vma->vm_file); + kfree(vma); + } + return ret; + + sharing_violation: + up_write(&nommu_vma_sem); + printk("Attempt to share mismatched mappings\n"); + kfree(vml); + return -EINVAL; + + error_getting_vma: + up_write(&nommu_vma_sem); + kfree(vml); + printk("Allocation of vml for %lu byte allocation from process %d failed\n", + len, current->pid); + show_free_areas(); + return -ENOMEM; + + error_getting_vml: + printk("Allocation of vml for %lu byte allocation from process %d failed\n", + len, current->pid); + show_free_areas(); + return -ENOMEM; +} + +/* + * handle mapping disposal for uClinux + */ +static void put_vma(struct vm_area_struct *vma) +{ + if (vma) { + down_write(&nommu_vma_sem); + + if (atomic_dec_and_test(&vma->vm_usage)) { + delete_nommu_vma(vma); + + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + + /* IO memory and memory shared directly out of the pagecache from + * ramfs/tmpfs mustn't be released here */ + if (vma->vm_flags & VM_MAPPED_COPY) { + realalloc -= kobjsize((void *) vma->vm_start); + askedalloc -= vma->vm_end - vma->vm_start; + kfree((void *) vma->vm_start); + } + + realalloc -= kobjsize(vma); + askedalloc -= sizeof(*vma); + + if (vma->vm_file) + fput(vma->vm_file); + kfree(vma); + } + + up_write(&nommu_vma_sem); + } +} + +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +{ + struct vm_list_struct *vml, **parent; + unsigned long end = addr + len; + +#ifdef DEBUG + printk("do_munmap:\n"); +#endif + + for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) + if ((*parent)->vma->vm_start == addr && + (*parent)->vma->vm_end == end) + goto found; + + printk("munmap of non-mmaped memory by process %d (%s): %p\n", + current->pid, current->comm, (void *) addr); + return -EINVAL; + + found: + vml = *parent; + + put_vma(vml->vma); + + *parent = vml->next; + realalloc -= kobjsize(vml); + askedalloc -= sizeof(*vml); + kfree(vml); + mm->total_vm -= len >> PAGE_SHIFT; + +#ifdef DEBUG + show_process_blocks(); +#endif + + return 0; +} + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct * mm) +{ + struct vm_list_struct *tmp; + + if (mm) { +#ifdef DEBUG + printk("Exit_mmap:\n"); +#endif + + mm->total_vm = 0; + + while ((tmp = mm->context.vmlist)) { + mm->context.vmlist = tmp->next; + put_vma(tmp->vma); + + realalloc -= kobjsize(tmp); + askedalloc -= sizeof(*tmp); + kfree(tmp); + } + +#ifdef DEBUG + show_process_blocks(); +#endif + } +} + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + * + * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the + * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable + */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_list_struct *vml = NULL; + + /* insanity checks first */ + if (new_len == 0) + return (unsigned long) -EINVAL; + + if (flags & MREMAP_FIXED && new_addr != addr) + return (unsigned long) -EINVAL; + + for (vml = current->mm->context.vmlist; vml; vml = vml->next) + if (vml->vma->vm_start == addr) + goto found; + + return (unsigned long) -EINVAL; + + found: + if (vml->vma->vm_end != vml->vma->vm_start + old_len) + return (unsigned long) -EFAULT; + + if (vml->vma->vm_flags & VM_MAYSHARE) + return (unsigned long) -EPERM; + + if (new_len > kobjsize((void *) addr)) + return (unsigned long) -ENOMEM; + + /* all checks complete - do it */ + vml->vma->vm_end = vml->vma->vm_start + new_len; + + askedalloc -= old_len; + askedalloc += new_len; + + return vml->vma->vm_start; +} + +/* + * Look up the first VMA which satisfies addr < vm_end, NULL if none + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_list_struct *vml; + + for (vml = mm->context.vmlist; vml; vml = vml->next) + if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) + return vml->vma; + + return NULL; +} + +EXPORT_SYMBOL(find_vma); + +struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) +{ + return NULL; +} + +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return NULL; +} + +int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long to, unsigned long size, pgprot_t prot) +{ + return -EPERM; +} + +void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} + +unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + return -ENOMEM; +} + +void arch_unmap_area(struct vm_area_struct *area) +{ +} + +void update_mem_hiwater(struct task_struct *tsk) +{ + unsigned long rss = get_mm_counter(tsk->mm, rss); + + if (likely(tsk->mm)) { + if (tsk->mm->hiwater_rss < rss) + tsk->mm->hiwater_rss = rss; + if (tsk->mm->hiwater_vm < tsk->mm->total_vm) + tsk->mm->hiwater_vm = tsk->mm->total_vm; + } +} + +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, + int even_cows) +{ +} + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(long pages, int cap_sys_admin) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + unsigned long n; + + free = get_page_cache_size(); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += atomic_read(&slab_reclaim_pages); + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + free -= free / 32; + + if (free > pages) + return 0; + + /* + * nr_free_pages() is very expensive on large systems, + * only call if we're about to fail. + */ + n = nr_free_pages(); + if (!cap_sys_admin) + n -= n / 32; + free += n; + + if (free > pages) + return 0; + vm_unacct_memory(pages); + return -ENOMEM; + } + + allowed = totalram_pages * sysctl_overcommit_ratio / 100; + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + allowed -= allowed / 32; + allowed += total_swap_pages; + + /* Don't let a single process grow too big: + leave 3% of the size of this process for other processes */ + allowed -= current->mm->total_vm / 32; + + if (atomic_read(&vm_committed_space) < allowed) + return 0; + + vm_unacct_memory(pages); + + return -ENOMEM; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c new file mode 100644 index 000000000000..9595a0f6c4b8 --- /dev/null +++ b/mm/oom_kill.c @@ -0,0 +1,292 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from kswapd() + * in linux/mm/vmscan.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. + */ + +#include +#include +#include +#include +#include + +/* #define DEBUG */ + +/** + * oom_badness - calculate a numeric value for how bad this task has been + * @p: task struct of which task we should calculate + * @p: current uptime in seconds + * + * The formula used is relatively simple and documented inline in the + * function. The main rationale is that we want to select a good task + * to kill when we run out of memory. + * + * Good in this context means that: + * 1) we lose the minimum amount of work done + * 2) we recover a large amount of memory + * 3) we don't kill anything innocent of eating tons of memory + * 4) we want to kill the minimum amount of processes (one) + * 5) we try to kill the process the user expects us to kill, this + * algorithm has been meticulously tuned to meet the principle + * of least surprise ... (be careful when you change it) + */ + +unsigned long badness(struct task_struct *p, unsigned long uptime) +{ + unsigned long points, cpu_time, run_time, s; + struct list_head *tsk; + + if (!p->mm) + return 0; + + /* + * The memory size of the process is the basis for the badness. + */ + points = p->mm->total_vm; + + /* + * Processes which fork a lot of child processes are likely + * a good choice. We add the vmsize of the childs if they + * have an own mm. This prevents forking servers to flood the + * machine with an endless amount of childs + */ + list_for_each(tsk, &p->children) { + struct task_struct *chld; + chld = list_entry(tsk, struct task_struct, sibling); + if (chld->mm != p->mm && chld->mm) + points += chld->mm->total_vm; + } + + /* + * CPU time is in tens of seconds and run time is in thousands + * of seconds. There is no particular reason for this other than + * that it turned out to work very well in practice. + */ + cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) + >> (SHIFT_HZ + 3); + + if (uptime >= p->start_time.tv_sec) + run_time = (uptime - p->start_time.tv_sec) >> 10; + else + run_time = 0; + + s = int_sqrt(cpu_time); + if (s) + points /= s; + s = int_sqrt(int_sqrt(run_time)); + if (s) + points /= s; + + /* + * Niced processes are most likely less important, so double + * their badness points. + */ + if (task_nice(p) > 0) + points *= 2; + + /* + * Superuser processes are usually more important, so we make it + * less likely that we kill those. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || + p->uid == 0 || p->euid == 0) + points /= 4; + + /* + * We don't want to kill a process with direct hardware access. + * Not only could that mess up the hardware, but usually users + * tend to only have this flag set on applications they think + * of as important. + */ + if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + points /= 4; + + /* + * Adjust the score by oomkilladj. + */ + if (p->oomkilladj) { + if (p->oomkilladj > 0) + points <<= p->oomkilladj; + else + points >>= -(p->oomkilladj); + } + +#ifdef DEBUG + printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + p->pid, p->comm, points); +#endif + return points; +} + +/* + * Simple selection loop. We chose the process with the highest + * number of 'points'. We expect the caller will lock the tasklist. + * + * (not docbooked, we don't want this one cluttering up the manual) + */ +static struct task_struct * select_bad_process(void) +{ + unsigned long maxpoints = 0; + struct task_struct *g, *p; + struct task_struct *chosen = NULL; + struct timespec uptime; + + do_posix_clock_monotonic_gettime(&uptime); + do_each_thread(g, p) + /* skip the init task with pid == 1 */ + if (p->pid > 1) { + unsigned long points; + + /* + * This is in the process of releasing memory so wait it + * to finish before killing some other task by mistake. + */ + if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) && + !(p->flags & PF_DEAD)) + return ERR_PTR(-1UL); + if (p->flags & PF_SWAPOFF) + return p; + + points = badness(p, uptime.tv_sec); + if (points > maxpoints || !chosen) { + chosen = p; + maxpoints = points; + } + } + while_each_thread(g, p); + return chosen; +} + +/** + * We must be careful though to never send SIGKILL a process with + * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that + * we select a process with CAP_SYS_RAW_IO set). + */ +static void __oom_kill_task(task_t *p) +{ + if (p->pid == 1) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill init!\n"); + return; + } + + task_lock(p); + if (!p->mm || p->mm == &init_mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); + task_unlock(p); + return; + } + task_unlock(p); + printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); + + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... + */ + p->time_slice = HZ; + set_tsk_thread_flag(p, TIF_MEMDIE); + + force_sig(SIGKILL, p); +} + +static struct mm_struct *oom_kill_task(task_t *p) +{ + struct mm_struct *mm = get_task_mm(p); + task_t * g, * q; + + if (!mm) + return NULL; + if (mm == &init_mm) { + mmput(mm); + return NULL; + } + + __oom_kill_task(p); + /* + * kill all processes that share the ->mm (i.e. all threads), + * but are in a different thread group + */ + do_each_thread(g, q) + if (q->mm == mm && q->tgid != p->tgid) + __oom_kill_task(q); + while_each_thread(g, q); + + return mm; +} + +static struct mm_struct *oom_kill_process(struct task_struct *p) +{ + struct mm_struct *mm; + struct task_struct *c; + struct list_head *tsk; + + /* Try to kill a child first */ + list_for_each(tsk, &p->children) { + c = list_entry(tsk, struct task_struct, sibling); + if (c->mm == p->mm) + continue; + mm = oom_kill_task(c); + if (mm) + return mm; + } + return oom_kill_task(p); +} + +/** + * oom_kill - kill the "best" process when we run out of memory + * + * If we run out of memory, we have the choice between either + * killing a random task (bad), letting the system crash (worse) + * OR try to be smart about which process to kill. Note that we + * don't have to be perfect here, we just have to be good. + */ +void out_of_memory(unsigned int __nocast gfp_mask) +{ + struct mm_struct *mm = NULL; + task_t * p; + + read_lock(&tasklist_lock); +retry: + p = select_bad_process(); + + if (PTR_ERR(p) == -1UL) + goto out; + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + show_free_areas(); + panic("Out of memory and no killable processes...\n"); + } + + printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); + show_free_areas(); + mm = oom_kill_process(p); + if (!mm) + goto retry; + + out: + read_unlock(&tasklist_lock); + if (mm) + mmput(mm); + + /* + * Give "p" a good chance of killing itself before we + * retry to allocate memory. + */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); +} diff --git a/mm/page-writeback.c b/mm/page-writeback.c new file mode 100644 index 000000000000..6ddd6a29c73b --- /dev/null +++ b/mm/page-writeback.c @@ -0,0 +1,819 @@ +/* + * mm/page-writeback.c. + * + * Copyright (C) 2002, Linus Torvalds. + * + * Contains functions related to writing back dirty pages at the + * address_space level. + * + * 10Apr2002 akpm@zip.com.au + * Initial version + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The maximum number of pages to writeout in a single bdflush/kupdate + * operation. We do this so we don't hold I_LOCK against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +/* + * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited + * will look to see if it needs to force writeback or throttling. + */ +static long ratelimit_pages = 32; + +static long total_pages; /* The total number of pages in the machine. */ +static int dirty_exceeded; /* Dirty mem may be over limit */ + +/* + * When balance_dirty_pages decides that the caller needs to perform some + * non-background writeback, this is how many pages it will attempt to write. + * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * large amounts of I/O are submitted. + */ +static inline long sync_writeback_pages(void) +{ + return ratelimit_pages + ratelimit_pages / 2; +} + +/* The following parameters are exported via /proc/sys/vm */ + +/* + * Start background writeback (via pdflush) at this percentage + */ +int dirty_background_ratio = 10; + +/* + * The generator of dirty data starts writeback at this percentage + */ +int vm_dirty_ratio = 40; + +/* + * The interval between `kupdate'-style writebacks, in centiseconds + * (hundredths of a second) + */ +int dirty_writeback_centisecs = 5 * 100; + +/* + * The longest number of centiseconds for which data is allowed to remain dirty + */ +int dirty_expire_centisecs = 30 * 100; + +/* + * Flag that makes the machine dump writes/reads and block dirtyings. + */ +int block_dump; + +/* + * Flag that puts the machine in "laptop mode". + */ +int laptop_mode; + +EXPORT_SYMBOL(laptop_mode); + +/* End of sysctl-exported parameters */ + + +static void background_writeout(unsigned long _min_pages); + +struct writeback_state +{ + unsigned long nr_dirty; + unsigned long nr_unstable; + unsigned long nr_mapped; + unsigned long nr_writeback; +}; + +static void get_writeback_state(struct writeback_state *wbs) +{ + wbs->nr_dirty = read_page_state(nr_dirty); + wbs->nr_unstable = read_page_state(nr_unstable); + wbs->nr_mapped = read_page_state(nr_mapped); + wbs->nr_writeback = read_page_state(nr_writeback); +} + +/* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. + */ +static void +get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, + struct address_space *mapping) +{ + int background_ratio; /* Percentages */ + int dirty_ratio; + int unmapped_ratio; + long background; + long dirty; + unsigned long available_memory = total_pages; + struct task_struct *tsk; + + get_writeback_state(wbs); + +#ifdef CONFIG_HIGHMEM + /* + * If this mapping can only allocate from low memory, + * we exclude high memory from our count. + */ + if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM)) + available_memory -= totalhigh_pages; +#endif + + + unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; + + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio > unmapped_ratio / 2) + dirty_ratio = unmapped_ratio / 2; + + if (dirty_ratio < 5) + dirty_ratio = 5; + + background_ratio = dirty_background_ratio; + if (background_ratio >= dirty_ratio) + background_ratio = dirty_ratio / 2; + + background = (background_ratio * available_memory) / 100; + dirty = (dirty_ratio * available_memory) / 100; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; +} + +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to perform writeback if the system is over `vm_dirty_ratio'. + * If we're over `background_thresh' then pdflush is woken to perform some + * writeout. + */ +static void balance_dirty_pages(struct address_space *mapping) +{ + struct writeback_state wbs; + long nr_reclaimable; + long background_thresh; + long dirty_thresh; + unsigned long pages_written = 0; + unsigned long write_chunk = sync_writeback_pages(); + + struct backing_dev_info *bdi = mapping->backing_dev_info; + + for (;;) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = write_chunk, + }; + + get_dirty_limits(&wbs, &background_thresh, + &dirty_thresh, mapping); + nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; + if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + break; + + dirty_exceeded = 1; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ + if (nr_reclaimable) { + writeback_inodes(&wbc); + get_dirty_limits(&wbs, &background_thresh, + &dirty_thresh, mapping); + nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; + if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + break; + pages_written += write_chunk - wbc.nr_to_write; + if (pages_written >= write_chunk) + break; /* We've done our duty */ + } + blk_congestion_wait(WRITE, HZ/10); + } + + if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + dirty_exceeded = 0; + + if (writeback_in_progress(bdi)) + return; /* pdflush is already working this queue */ + + /* + * In laptop mode, we wait until hitting the higher threshold before + * starting background writeout, and then write out all the way down + * to the lower threshold. So slow writers cause minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if ((laptop_mode && pages_written) || + (!laptop_mode && (nr_reclaimable > background_thresh))) + pdflush_operation(background_writeout, 0); +} + +/** + * balance_dirty_pages_ratelimited - balance dirty memory state + * @mapping - address_space which was dirtied + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * On really big machines, get_writeback_state is expensive, so try to avoid + * calling it too often (ratelimiting). But once we're over the dirty memory + * limit we decrease the ratelimiting by a lot, to prevent individual processes + * from overshooting the limit by (ratelimit_pages) each. + */ +void balance_dirty_pages_ratelimited(struct address_space *mapping) +{ + static DEFINE_PER_CPU(int, ratelimits) = 0; + long ratelimit; + + ratelimit = ratelimit_pages; + if (dirty_exceeded) + ratelimit = 8; + + /* + * Check the rate limiting. Also, we do not want to throttle real-time + * tasks in balance_dirty_pages(). Period. + */ + if (get_cpu_var(ratelimits)++ >= ratelimit) { + __get_cpu_var(ratelimits) = 0; + put_cpu_var(ratelimits); + balance_dirty_pages(mapping); + return; + } + put_cpu_var(ratelimits); +} +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); + +void throttle_vm_writeout(void) +{ + struct writeback_state wbs; + long background_thresh; + long dirty_thresh; + + for ( ; ; ) { + get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + + /* + * Boost the allowable dirty threshold a bit for page + * allocators so they don't get DoS'ed by heavy writers + */ + dirty_thresh += dirty_thresh / 10; /* wheeee... */ + + if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) + break; + blk_congestion_wait(WRITE, HZ/10); + } +} + + +/* + * writeback at least _min_pages, and keep writing until the amount of dirty + * memory is less than the background threshold, or until we're all clean. + */ +static void background_writeout(unsigned long _min_pages) +{ + long min_pages = _min_pages; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 0, + .nonblocking = 1, + }; + + for ( ; ; ) { + struct writeback_state wbs; + long background_thresh; + long dirty_thresh; + + get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + if (wbs.nr_dirty + wbs.nr_unstable < background_thresh + && min_pages <= 0) + break; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes(&wbc); + min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + /* Wrote less than expected */ + blk_congestion_wait(WRITE, HZ/10); + if (!wbc.encountered_congestion) + break; + } + } +} + +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. Returns 0 if a pdflush thread was dispatched. Returns + * -1 if all pdflush threads were busy. + */ +int wakeup_bdflush(long nr_pages) +{ + if (nr_pages == 0) { + struct writeback_state wbs; + + get_writeback_state(&wbs); + nr_pages = wbs.nr_dirty + wbs.nr_unstable; + } + return pdflush_operation(background_writeout, nr_pages); +} + +static void wb_timer_fn(unsigned long unused); +static void laptop_timer_fn(unsigned long unused); + +static struct timer_list wb_timer = + TIMER_INITIALIZER(wb_timer_fn, 0, 0); +static struct timer_list laptop_mode_wb_timer = + TIMER_INITIALIZER(laptop_timer_fn, 0, 0); + +/* + * Periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_centisecs. But if a writeback event + * takes longer than a dirty_writeback_centisecs interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static void wb_kupdate(unsigned long arg) +{ + unsigned long oldest_jif; + unsigned long start_jif; + unsigned long next_jif; + long nr_to_write; + struct writeback_state wbs; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = &oldest_jif, + .nr_to_write = 0, + .nonblocking = 1, + .for_kupdate = 1, + }; + + sync_supers(); + + get_writeback_state(&wbs); + oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + start_jif = jiffies; + next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; + nr_to_write = wbs.nr_dirty + wbs.nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + while (nr_to_write > 0) { + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + writeback_inodes(&wbc); + if (wbc.nr_to_write > 0) { + if (wbc.encountered_congestion) + blk_congestion_wait(WRITE, HZ/10); + else + break; /* All the old data is written */ + } + nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + } + if (time_before(next_jif, jiffies + HZ)) + next_jif = jiffies + HZ; + if (dirty_writeback_centisecs) + mod_timer(&wb_timer, next_jif); +} + +/* + * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs + */ +int dirty_writeback_centisecs_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if (dirty_writeback_centisecs) { + mod_timer(&wb_timer, + jiffies + (dirty_writeback_centisecs * HZ) / 100); + } else { + del_timer(&wb_timer); + } + return 0; +} + +static void wb_timer_fn(unsigned long unused) +{ + if (pdflush_operation(wb_kupdate, 0) < 0) + mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ +} + +static void laptop_flush(unsigned long unused) +{ + sys_sync(); +} + +static void laptop_timer_fn(unsigned long unused) +{ + pdflush_operation(laptop_flush, 0); +} + +/* + * We've spun up the disk and we're in laptop mode: schedule writeback + * of all dirty data a few seconds from now. If the flush is already scheduled + * then push it back - the user is still using the disk. + */ +void laptop_io_completion(void) +{ + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); +} + +/* + * We're in laptop mode and we've just synced. The sync's writes will have + * caused another writeback to be scheduled by laptop_io_completion. + * Nothing needs to be written back anymore, so we unschedule the writeback. + */ +void laptop_sync_completion(void) +{ + del_timer(&laptop_mode_wb_timer); +} + +/* + * If ratelimit_pages is too high then we can get into dirty-data overload + * if a large number of processes all perform writes at the same time. + * If it is too low then SMP machines will call the (expensive) + * get_writeback_state too often. + * + * Here we set ratelimit_pages to a level which ensures that when all CPUs are + * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory + * thresholds before writeback cuts in. + * + * But the limit should not be set too high. Because it also controls the + * amount of memory which the balance_dirty_pages() caller has to write back. + * If this is too large then the caller will block on the IO queue all the + * time. So limit it to four megabytes - the balance_dirty_pages() caller + * will write six megabyte chunks, max. + */ + +static void set_ratelimit(void) +{ + ratelimit_pages = total_pages / (num_online_cpus() * 32); + if (ratelimit_pages < 16) + ratelimit_pages = 16; + if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) + ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; +} + +static int +ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) +{ + set_ratelimit(); + return 0; +} + +static struct notifier_block ratelimit_nb = { + .notifier_call = ratelimit_handler, + .next = NULL, +}; + +/* + * If the machine has a large highmem:lowmem ratio then scale back the default + * dirty memory thresholds: allowing too much dirty highmem pins an excessive + * number of buffer_heads. + */ +void __init page_writeback_init(void) +{ + long buffer_pages = nr_free_buffer_pages(); + long correction; + + total_pages = nr_free_pagecache_pages(); + + correction = (100 * 4 * buffer_pages) / total_pages; + + if (correction < 100) { + dirty_background_ratio *= correction; + dirty_background_ratio /= 100; + vm_dirty_ratio *= correction; + vm_dirty_ratio /= 100; + + if (dirty_background_ratio <= 0) + dirty_background_ratio = 1; + if (vm_dirty_ratio <= 0) + vm_dirty_ratio = 1; + } + mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); + set_ratelimit(); + register_cpu_notifier(&ratelimit_nb); +} + +int do_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + if (wbc->nr_to_write <= 0) + return 0; + if (mapping->a_ops->writepages) + return mapping->a_ops->writepages(mapping, wbc); + return generic_writepages(mapping, wbc); +} + +/** + * write_one_page - write out a single page and optionally wait on I/O + * + * @page - the page to write + * @wait - if true, wait on writeout + * + * The page must be locked by the caller and will be unlocked upon return. + * + * write_one_page() returns a negative error code if I/O failed. + */ +int write_one_page(struct page *page, int wait) +{ + struct address_space *mapping = page->mapping; + int ret = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + BUG_ON(!PageLocked(page)); + + if (wait) + wait_on_page_writeback(page); + + if (clear_page_dirty_for_io(page)) { + page_cache_get(page); + ret = mapping->a_ops->writepage(page, &wbc); + if (ret == 0 && wait) { + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + page_cache_release(page); + } else { + unlock_page(page); + } + return ret; +} +EXPORT_SYMBOL(write_one_page); + +/* + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. + * + * This is also used when a single buffer is being dirtied: we want to set the + * page dirty in that case, but not all the buffers. This is a "bottom-up" + * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * + * Most callers have locked the page, which pins the address_space in memory. + * But zap_pte_range() does not lock the page, however in that case the + * mapping is pinned by the vma's ->vm_file reference. + * + * We take care to handle the case where the page was truncated from the + * mapping by re-checking page_mapping() insode tree_lock. + */ +int __set_page_dirty_nobuffers(struct page *page) +{ + int ret = 0; + + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + struct address_space *mapping2; + + if (mapping) { + write_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ + BUG_ON(mapping2 != mapping); + if (mapping_cap_account_dirty(mapping)) + inc_page_state(nr_dirty); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&mapping->tree_lock); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, + I_DIRTY_PAGES); + } + } + } + return ret; +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +/* + * When a writepage implementation decides that it doesn't want to write this + * page for some reason, it should redirty the locked page via + * redirty_page_for_writepage() and it should then unlock the page and return 0 + */ +int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +{ + wbc->pages_skipped++; + return __set_page_dirty_nobuffers(page); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +/* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + */ +int fastcall set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (likely(mapping)) { + int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + return __set_page_dirty_buffers(page); + } + if (!PageDirty(page)) + SetPageDirty(page); + return 0; +} +EXPORT_SYMBOL(set_page_dirty); + +/* + * set_page_dirty() is racy if the caller has no reference against + * page->mapping->host, and if the page is unlocked. This is because another + * CPU could truncate the page off the mapping and then free the mapping. + * + * Usually, the page _is_ locked, or the caller is a user-space process which + * holds a reference on the inode by having an open file. + * + * In other cases, the page should be locked before running set_page_dirty(). + */ +int set_page_dirty_lock(struct page *page) +{ + int ret; + + lock_page(page); + ret = set_page_dirty(page); + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(set_page_dirty_lock); + +/* + * Clear a page's dirty flag, while caring for dirty memory accounting. + * Returns true if the page was previously dirty. + */ +int test_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + if (mapping) { + write_lock_irqsave(&mapping->tree_lock, flags); + if (TestClearPageDirty(page)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + write_unlock_irqrestore(&mapping->tree_lock, flags); + if (mapping_cap_account_dirty(mapping)) + dec_page_state(nr_dirty); + return 1; + } + write_unlock_irqrestore(&mapping->tree_lock, flags); + return 0; + } + return TestClearPageDirty(page); +} +EXPORT_SYMBOL(test_clear_page_dirty); + +/* + * Clear a page's dirty flag, while caring for dirty memory accounting. + * Returns true if the page was previously dirty. + * + * This is for preparing to put the page under writeout. We leave the page + * tagged as dirty in the radix tree so that a concurrent write-for-sync + * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage + * implementation will run either set_page_writeback() or set_page_dirty(), + * at which stage we bring the page's dirty flag and radix-tree dirty tag + * back into sync. + * + * This incoherency between the page's dirty flag and radix-tree tag is + * unfortunate, but it only exists while the page is locked. + */ +int clear_page_dirty_for_io(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping) { + if (TestClearPageDirty(page)) { + if (mapping_cap_account_dirty(mapping)) + dec_page_state(nr_dirty); + return 1; + } + return 0; + } + return TestClearPageDirty(page); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +int test_clear_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + unsigned long flags; + + write_lock_irqsave(&mapping->tree_lock, flags); + ret = TestClearPageWriteback(page); + if (ret) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_WRITEBACK); + write_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestClearPageWriteback(page); + } + return ret; +} + +int test_set_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + unsigned long flags; + + write_lock_irqsave(&mapping->tree_lock, flags); + ret = TestSetPageWriteback(page); + if (!ret) + radix_tree_tag_set(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_WRITEBACK); + if (!PageDirty(page)) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + write_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestSetPageWriteback(page); + } + return ret; + +} +EXPORT_SYMBOL(test_set_page_writeback); + +/* + * Return true if any of the pages in the mapping are marged with the + * passed tag. + */ +int mapping_tagged(struct address_space *mapping, int tag) +{ + unsigned long flags; + int ret; + + read_lock_irqsave(&mapping->tree_lock, flags); + ret = radix_tree_tagged(&mapping->page_tree, tag); + read_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; +} +EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c new file mode 100644 index 000000000000..c73dbbc1cd8f --- /dev/null +++ b/mm/page_alloc.c @@ -0,0 +1,2220 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +/* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner + */ +nodemask_t node_online_map = { { [0] = 1UL } }; +nodemask_t node_possible_map = NODE_MASK_ALL; +struct pglist_data *pgdat_list; +unsigned long totalram_pages; +unsigned long totalhigh_pages; +long nr_swap_pages; + +/* + * results with 256, 32 in the lowmem_reserve sysctl: + * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) + * 1G machine -> (16M dma, 784M normal, 224M high) + * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA + * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL + * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + */ +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; + +EXPORT_SYMBOL(totalram_pages); +EXPORT_SYMBOL(nr_swap_pages); + +/* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags + */ +struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +int min_free_kbytes = 1024; + +unsigned long __initdata nr_kernel_pages; +unsigned long __initdata nr_all_pages; + +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + return 1; + if (page_to_pfn(page) < zone->zone_start_pfn) + return 1; +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 1; +#endif + if (zone != page_zone(page)) + return 1; + return 0; +} + +static void bad_page(const char *function, struct page *page) +{ + printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", + function, current->comm, page); + printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", + (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, + page->mapping, page_mapcount(page), page_count(page)); + printk(KERN_EMERG "Backtrace:\n"); + dump_stack(); + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); + page->flags &= ~(1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_swapcache | + 1 << PG_writeback); + set_page_count(page, 0); + reset_page_mapcount(page); + page->mapping = NULL; + tainted |= TAINT_BAD_PAGE; +} + +#ifndef CONFIG_HUGETLB_PAGE +#define prep_compound_page(page, order) do { } while (0) +#define destroy_compound_page(page, order) do { } while (0) +#else +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All pages have their ->private pointing at + * the head page (even the head page has this). + * + * The first tail page's ->mapping, if non-zero, holds the address of the + * compound page's put_page() function. + * + * The order of the allocation is stored in the first tail page's ->index + * This is only for debug at present. This usage means that zero-order pages + * may not be compound. + */ +static void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + page[1].mapping = NULL; + page[1].index = order; + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + SetPageCompound(p); + p->private = (unsigned long)page; + } +} + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (!PageCompound(page)) + return; + + if (page[1].index != order) + bad_page(__FUNCTION__, page); + + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + if (!PageCompound(p)) + bad_page(__FUNCTION__, page); + if (p->private != (unsigned long)page) + bad_page(__FUNCTION__, page); + ClearPageCompound(p); + } +} +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * function for dealing with page's order in buddy system. + * zone->lock is already acquired when we use these. + * So, we don't need atomic page->flags operations here. + */ +static inline unsigned long page_order(struct page *page) { + return page->private; +} + +static inline void set_page_order(struct page *page, int order) { + page->private = order; + __SetPagePrivate(page); +} + +static inline void rmv_page_order(struct page *page) +{ + __ClearPagePrivate(page); + page->private = 0; +} + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contigious at least up to MAX_ORDER + */ +static inline struct page * +__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) +{ + unsigned long buddy_idx = page_idx ^ (1 << order); + + return page + (buddy_idx - page_idx); +} + +static inline unsigned long +__find_combined_index(unsigned long page_idx, unsigned int order) +{ + return (page_idx & ~(1 << order)); +} + +/* + * This function checks whether a page is free && is the buddy + * we can do coalesce a page and its buddy if + * (a) the buddy is free && + * (b) the buddy is on the buddy system && + * (c) a page and its buddy have the same order. + * for recording page's order, we use page->private and PG_private. + * + */ +static inline int page_is_buddy(struct page *page, int order) +{ + if (PagePrivate(page) && + (page_order(page) == order) && + !PageReserved(page) && + page_count(page) == 0) + return 1; + return 0; +} + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of continuous + * free pages of length of (1 << order) and marked with PG_Private.Page's + * order is recorded in page->private field. + * So when we are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static inline void __free_pages_bulk (struct page *page, + struct zone *zone, unsigned int order) +{ + unsigned long page_idx; + int order_size = 1 << order; + + if (unlikely(order)) + destroy_compound_page(page, order); + + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + + BUG_ON(page_idx & (order_size - 1)); + BUG_ON(bad_range(zone, page)); + + zone->free_pages += order_size; + while (order < MAX_ORDER-1) { + unsigned long combined_idx; + struct free_area *area; + struct page *buddy; + + combined_idx = __find_combined_index(page_idx, order); + buddy = __page_find_buddy(page, page_idx, order); + + if (bad_range(zone, buddy)) + break; + if (!page_is_buddy(buddy, order)) + break; /* Move the buddy up one level. */ + list_del(&buddy->lru); + area = zone->free_area + order; + area->nr_free--; + rmv_page_order(buddy); + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); + list_add(&page->lru, &zone->free_area[order].free_list); + zone->free_area[order].nr_free++; +} + +static inline void free_pages_check(const char *function, struct page *page) +{ + if ( page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback ))) + bad_page(function, page); + if (PageDirty(page)) + ClearPageDirty(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free, or 0 for all on the list. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. + */ +static int +free_pages_bulk(struct zone *zone, int count, + struct list_head *list, unsigned int order) +{ + unsigned long flags; + struct page *page = NULL; + int ret = 0; + + spin_lock_irqsave(&zone->lock, flags); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (!list_empty(list) && count--) { + page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_pages_bulk list manipulates */ + list_del(&page->lru); + __free_pages_bulk(page, zone, order); + ret++; + } + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} + +void __free_pages_ok(struct page *page, unsigned int order) +{ + LIST_HEAD(list); + int i; + + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); + +#ifndef CONFIG_MMU + if (order > 0) + for (i = 1 ; i < (1 << order) ; ++i) + __put_page(page + i); +#endif + + for (i = 0 ; i < (1 << order) ; ++i) + free_pages_check(__FUNCTION__, page + i); + list_add(&page->lru, &list); + kernel_map_pages(page, 1< low) { + area--; + high--; + size >>= 1; + BUG_ON(bad_range(zone, &page[size])); + list_add(&page[size].lru, &area->free_list); + area->nr_free++; + set_page_order(&page[size], high); + } + return page; +} + +void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. + * - eg: access_process_vm() + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page + i, 1); +#endif /* CONFIG_MMU */ +} + +/* + * This page is about to be returned from the page allocator + */ +static void prep_new_page(struct page *page, int order) +{ + if (page->mapping || page_mapcount(page) || + (page->flags & ( + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_swapcache | + 1 << PG_writeback ))) + bad_page(__FUNCTION__, page); + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk); + page->private = 0; + set_page_refs(page, order); + kernel_map_pages(page, 1 << order, 1); +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. + */ +static struct page *__rmqueue(struct zone *zone, unsigned int order) +{ + struct free_area * area; + unsigned int current_order; + struct page *page; + + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = zone->free_area + current_order; + if (list_empty(&area->free_list)) + continue; + + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); + rmv_page_order(page); + area->nr_free--; + zone->free_pages -= 1UL << order; + return expand(zone, page, order, current_order, area); + } + + return NULL; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list) +{ + unsigned long flags; + int i; + int allocated = 0; + struct page *page; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { + page = __rmqueue(zone, order); + if (page == NULL) + break; + allocated++; + list_add_tail(&page->lru, list); + } + spin_unlock_irqrestore(&zone->lock, flags); + return allocated; +} + +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) +static void __drain_pages(unsigned int cpu) +{ + struct zone *zone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + pset = &zone->pageset[cpu]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } +} +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_PM + +void mark_free_pages(struct zone *zone) +{ + unsigned long zone_pfn, flags; + int order; + struct list_head *curr; + + if (!zone->spanned_pages) + return; + + spin_lock_irqsave(&zone->lock, flags); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); + + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) { + unsigned long start_pfn, i; + + start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); + + for (i=0; i < (1<lock, flags); +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); +} +#endif /* CONFIG_PM */ + +static void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ +#ifdef CONFIG_NUMA + unsigned long flags; + int cpu; + pg_data_t *pg = z->zone_pgdat; + pg_data_t *orig = zonelist->zones[0]->zone_pgdat; + struct per_cpu_pageset *p; + + local_irq_save(flags); + cpu = smp_processor_id(); + p = &z->pageset[cpu]; + if (pg == orig) { + z->pageset[cpu].numa_hit++; + } else { + p->numa_miss++; + zonelist->zones[0]->pageset[cpu].numa_foreign++; + } + if (pg == NODE_DATA(numa_node_id())) + p->local_node++; + else + p->other_node++; + local_irq_restore(flags); +#endif +} + +/* + * Free a 0-order page + */ +static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); +static void fastcall free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + arch_free_page(page, 0); + + kernel_map_pages(page, 1, 0); + inc_page_state(pgfree); + if (PageAnon(page)) + page->mapping = NULL; + free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + list_add(&page->lru, &pcp->list); + pcp->count++; + local_irq_restore(flags); + put_cpu(); +} + +void fastcall free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void fastcall free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + for(i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + +/* + * Really, prep_compound_page() should be called from __rmqueue_bulk(). But + * we cheat by calling it from here, in the order > 0 path. Saves a branch + * or two. + */ +static struct page * +buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) +{ + unsigned long flags; + struct page *page = NULL; + int cold = !!(gfp_flags & __GFP_COLD); + + if (order == 0) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count <= pcp->low) + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; + } + local_irq_restore(flags); + put_cpu(); + } + + if (page == NULL) { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock_irqrestore(&zone->lock, flags); + } + + if (page != NULL) { + BUG_ON(bad_range(zone, page)); + mod_page_state_zone(zone, pgalloc, 1 << order); + prep_new_page(page, order); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + } + return page; +} + +/* + * Return 1 if free pages are above 'mark'. This takes into account the order + * of the allocation. + */ +int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int can_try_harder, int gfp_high) +{ + /* free_pages my go negative - that's OK */ + long min = mark, free_pages = z->free_pages - (1 << order) + 1; + int o; + + if (gfp_high) + min -= min / 2; + if (can_try_harder) + min -= min / 4; + + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + return 0; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ + free_pages -= z->free_area[o].nr_free << o; + + /* Require fewer higher order pages to be free */ + min >>= 1; + + if (free_pages <= min) + return 0; + } + return 1; +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * fastcall +__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const int wait = gfp_mask & __GFP_WAIT; + struct zone **zones, *z; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int i; + int classzone_idx; + int do_retry; + int can_try_harder; + int did_some_progress; + + might_sleep_if(wait); + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or is the caller has realtime scheduling + * policy + */ + can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; + + zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ + + if (unlikely(zones[0] == NULL)) { + /* Should this ever happen?? */ + return NULL; + } + + classzone_idx = zone_idx(zones[0]); + + restart: + /* Go through the zonelist once, looking for a zone with enough free */ + for (i = 0; (z = zones[i]) != NULL; i++) { + + if (!zone_watermark_ok(z, order, z->pages_low, + classzone_idx, 0, 0)) + continue; + + if (!cpuset_zone_allowed(z)) + continue; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + + for (i = 0; (z = zones[i]) != NULL; i++) + wakeup_kswapd(z, order); + + /* + * Go through the zonelist again. Let __GFP_HIGH and allocations + * coming from realtime tasks to go deeper into reserves + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + */ + for (i = 0; (z = zones[i]) != NULL; i++) { + if (!zone_watermark_ok(z, order, z->pages_min, + classzone_idx, can_try_harder, + gfp_mask & __GFP_HIGH)) + continue; + + if (wait && !cpuset_zone_allowed(z)) + continue; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + + /* This allocation should allow future memory freeing. */ + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { + /* go through the zonelist yet again, ignoring mins */ + for (i = 0; (z = zones[i]) != NULL; i++) { + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + +rebalance: + cond_resched(); + + /* We now go into synchronous reclaim */ + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + did_some_progress = try_to_free_pages(zones, gfp_mask, order); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (likely(did_some_progress)) { + /* + * Go through the zonelist yet one more time, keep + * very high watermark here, this is only to catch + * a parallel oom killing, we must fail if we're still + * under heavy pressure. + */ + for (i = 0; (z = zones[i]) != NULL; i++) { + if (!zone_watermark_ok(z, order, z->pages_min, + classzone_idx, can_try_harder, + gfp_mask & __GFP_HIGH)) + continue; + + if (!cpuset_zone_allowed(z)) + continue; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + /* + * Go through the zonelist yet one more time, keep + * very high watermark here, this is only to catch + * a parallel oom killing, we must fail if we're still + * under heavy pressure. + */ + for (i = 0; (z = zones[i]) != NULL; i++) { + if (!zone_watermark_ok(z, order, z->pages_high, + classzone_idx, 0, 0)) + continue; + + if (!cpuset_zone_allowed(z)) + continue; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + + out_of_memory(gfp_mask); + goto restart; + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order + * <= 3, but that may not be true in other implementations. + */ + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + } + return NULL; +got_pg: + zone_statistics(zonelist, z); + return page; +} + +EXPORT_SYMBOL(__alloc_pages); + +/* + * Common helper functions. + */ +fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order) +{ + struct page * page; + page = alloc_pages(gfp_mask, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} + +EXPORT_SYMBOL(__get_free_pages); + +fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask) +{ + struct page * page; + + /* + * get_zeroed_page() returns a 32-bit address, which cannot represent + * a highmem page + */ + BUG_ON(gfp_mask & __GFP_HIGHMEM); + + page = alloc_pages(gfp_mask | __GFP_ZERO, 0); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +EXPORT_SYMBOL(get_zeroed_page); + +void __pagevec_free(struct pagevec *pvec) +{ + int i = pagevec_count(pvec); + + while (--i >= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (!PageReserved(page) && put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } +} + +EXPORT_SYMBOL(__free_pages); + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + BUG_ON(!virt_addr_valid((void *)addr)); + __free_pages(virt_to_page((void *)addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages(void) +{ + unsigned int sum = 0; + struct zone *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +EXPORT_SYMBOL(nr_free_pages); + +#ifdef CONFIG_NUMA +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) +{ + unsigned int i, sum = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += pgdat->node_zones[i].free_pages; + + return sum; +} +#endif + +static unsigned int nr_free_zone_pages(int offset) +{ + pg_data_t *pgdat; + unsigned int sum = 0; + + for_each_pgdat(pgdat) { + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone *zone; + + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); +} + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); +} + +#ifdef CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} +#endif + +#ifdef CONFIG_NUMA +static void show_node(struct zone *zone) +{ + printk("Node %d ", zone->zone_pgdat->node_id); +} +#else +#define show_node(zone) do { } while (0) +#endif + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. + */ +static DEFINE_PER_CPU(struct page_state, page_states) = {0}; + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +void __get_page_state(struct page_state *ret, int nr) +{ + int cpu = 0; + + memset(ret, 0, sizeof(*ret)); + + cpu = first_cpu(cpu_online_map); + while (cpu < NR_CPUS) { + unsigned long *in, *out, off; + + in = (unsigned long *)&per_cpu(page_states, cpu); + + cpu = next_cpu(cpu, cpu_online_map); + + if (cpu < NR_CPUS) + prefetch(&per_cpu(page_states, cpu)); + + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state(struct page_state *ret) +{ + int nr; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1); +} + +void get_full_page_state(struct page_state *ret) +{ + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); +} + +unsigned long __read_page_state(unsigned offset) +{ + unsigned long ret = 0; + int cpu; + + for_each_online_cpu(cpu) { + unsigned long in; + + in = (unsigned long)&per_cpu(page_states, cpu) + offset; + ret += *((unsigned long *)in); + } + return ret; +} + +void __mod_page_state(unsigned offset, unsigned long delta) +{ + unsigned long flags; + void* ptr; + + local_irq_save(flags); + ptr = &__get_cpu_var(page_states); + *(unsigned long*)(ptr + offset) += delta; + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__mod_page_state); + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zones[i].nr_active; + *inactive += zones[i].nr_inactive; + *free += zones[i].free_pages; + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = nr_free_pages(); + val->bufferram = nr_blockdev_pages(); +#ifdef CONFIG_HIGHMEM + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = nr_free_pages_pgdat(pgdat); + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + struct page_state ps; + int cpu, temperature; + unsigned long active; + unsigned long inactive; + unsigned long free; + struct zone *zone; + + for_each_zone(zone) { + show_node(zone); + printk("%s per-cpu:", zone->name); + + if (!zone->present_pages) { + printk(" empty\n"); + continue; + } else + printk("\n"); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct per_cpu_pageset *pageset; + + if (!cpu_possible(cpu)) + continue; + + pageset = zone->pageset + cpu; + + for (temperature = 0; temperature < 2; temperature++) + printk("cpu %d %s: low %d, high %d, batch %d\n", + cpu, + temperature ? "cold" : "hot", + pageset->pcp[temperature].low, + pageset->pcp[temperature].high, + pageset->pcp[temperature].batch); + } + } + + get_page_state(&ps); + get_zone_counts(&active, &inactive, &free); + + printk("\nFree pages: %11ukB (%ukB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " + "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + active, + inactive, + ps.nr_dirty, + ps.nr_writeback, + ps.nr_unstable, + nr_free_pages(), + ps.nr_slab, + ps.nr_mapped, + ps.nr_page_table_pages); + + for_each_zone(zone) { + int i; + + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + zone->name, + K(zone->free_pages), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone->nr_active), + K(zone->nr_inactive), + K(zone->present_pages), + zone->pages_scanned, + (zone->all_unreclaimable ? "yes" : "no") + ); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->lowmem_reserve[i]); + printk("\n"); + } + + for_each_zone(zone) { + unsigned long nr, flags, order, total = 0; + + show_node(zone); + printk("%s: ", zone->name); + if (!zone->present_pages) { + printk("empty\n"); + continue; + } + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr = zone->free_area[order].nr_free; + total += nr << order; + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + printk("= %lukB\n", K(total)); + } + + show_swap_cache_info(); +} + +/* + * Builds allocation fallback zone lists. + */ +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +{ + switch (k) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->present_pages) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->present_pages) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->present_pages) + zonelist->zones[j++] = zone; + } + + return j; +} + +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (num_online_nodes()) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given + * node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: nodemask_t of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. + */ +static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +{ + int i, n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(i) { + cpumask_t tmp; + + /* Start from local node */ + n = (node+i) % num_online_nodes(); + + /* Don't want a node to appear more than once */ + if (node_isset(n, *used_node_mask)) + continue; + + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + best_node = node; + break; + } + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Give preference to headless and unused nodes */ + tmp = node_to_cpumask(n); + if (!cpus_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + node_set(best_node, *used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + nodemask_t used_mask; + + /* initialize zonelists */ + for (i = 0; i < GFP_ZONETYPES; i++) { + zonelist = pgdat->node_zonelists + i; + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = num_online_nodes(); + prev_node = local_node; + nodes_clear(used_mask); + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < GFP_ZONETYPES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + for (i = 0; i < GFP_ZONETYPES; i++) { + struct zonelist *zonelist; + + zonelist = pgdat->node_zonelists + i; + + j = 0; + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + } + for (node = 0; node < local_node; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + } + + zonelist->zones[j] = NULL; + } +} + +#endif /* CONFIG_NUMA */ + +void __init build_all_zonelists(void) +{ + int i; + + for_each_online_node(i) + build_zonelists(NODE_DATA(i)); + printk("Built %i zonelists\n", num_online_nodes()); + cpuset_init_current_mems_allowed(); +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static void __init calculate_zone_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn) +{ + struct page *start = pfn_to_page(start_pfn); + struct page *page; + + for (page = start; page < (start + size); page++) { + set_page_zone(page, NODEZONE(nid, zone)); + set_page_count(page, 0); + reset_page_mapcount(page); + SetPageReserved(page); + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(start_pfn << PAGE_SHIFT)); +#endif + start_pfn++; + } +} + +void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, + unsigned long size) +{ + int order; + for (order = 0; order < MAX_ORDER ; order++) { + INIT_LIST_HEAD(&zone->free_area[order].free_list); + zone->free_area[order].nr_free = 0; + } +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(size, nid, zone, start_pfn) \ + memmap_init_zone((size), (nid), (zone), (start_pfn)) +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __init free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long i, j; + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + int cpu, nid = pgdat->node_id; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + pgdat->kswapd_max_order = 0; + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize; + unsigned long batch; + + zone_table[NODEZONE(nid, j)] = zone; + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + if (j == ZONE_DMA || j == ZONE_NORMAL) + nr_kernel_pages += realsize; + nr_all_pages += realsize; + + zone->spanned_pages = size; + zone->present_pages = realsize; + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + + zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone_names[j], realsize, batch); + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); + zone->nr_scan_active = 0; + zone->nr_scan_inactive = 0; + zone->nr_active = 0; + zone->nr_inactive = 0; + if (!size) + continue; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + if ((zone_start_pfn) & (zone_required_alignment-1)) + printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); + + memmap_init(size, nid, j, zone_start_pfn); + + zone_start_pfn += size; + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); + } +} + +static void __init alloc_node_mem_map(struct pglist_data *pgdat) +{ + unsigned long size; + + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + } +#ifndef CONFIG_DISCONTIGMEM + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) + mem_map = NODE_DATA(0)->node_mem_map; +#endif +} + +void __init free_area_init_node(int nid, struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long node_start_pfn, + unsigned long *zholes_size) +{ + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_zone_totalpages(pgdat, zones_size, zholes_size); + + alloc_node_mem_map(pgdat); + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifndef CONFIG_DISCONTIGMEM +static bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +EXPORT_SYMBOL(contig_page_data); + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, &contig_page_data, zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); +} +#endif + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return pgdat->pgdat_next; +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!zone->present_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + "pgalloc_high", + + "pgalloc_normal", + "pgalloc_dma", + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma", + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + + "pgscan_kswapd_dma", + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma", + "pginodesteal", + + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_HOTPLUG_CPU +static int page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + long *count; + unsigned long *src, *dest; + + if (action == CPU_DEAD) { + int i; + + /* Drain local pagecache count. */ + count = &per_cpu(nr_pagecache_local, cpu); + atomic_add(*count, &nr_pagecache); + *count = 0; + local_irq_disable(); + __drain_pages(cpu); + + /* Add dead cpu's page_states to our own. */ + dest = (unsigned long *)&__get_cpu_var(page_states); + src = (unsigned long *)&per_cpu(page_states, cpu); + + for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); + i++) { + dest[i] += src[i]; + src[i] = 0; + } + + local_irq_enable(); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init page_alloc_init(void) +{ + hotcpu_notifier(page_alloc_cpu_notify, 0); +} + +/* + * setup_per_zone_lowmem_reserve - called whenever + * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * has a correct pages reserved value, so an adequate number of + * pages are left in the zone after a successful __alloc_pages(). + */ +static void setup_per_zone_lowmem_reserve(void) +{ + struct pglist_data *pgdat; + int j, idx; + + for_each_pgdat(pgdat) { + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long present_pages = zone->present_pages; + + zone->lowmem_reserve[j] = 0; + + for (idx = j-1; idx >= 0; idx--) { + struct zone *lower_zone; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) + sysctl_lowmem_reserve_ratio[idx] = 1; + + lower_zone = pgdat->node_zones + idx; + lower_zone->lowmem_reserve[j] = present_pages / + sysctl_lowmem_reserve_ratio[idx]; + present_pages += lower_zone->present_pages; + } + } + } +} + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. + */ +static void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + } + + for_each_zone(zone) { + spin_lock_irqsave(&zone->lru_lock, flags); + if (is_highmem(zone)) { + /* + * Often, highmem doesn't need to reserve any pages. + * But the pages_min/low/high values are also used for + * batching up page reclaim activity so we need a + * decent value here. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* if it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. + */ + zone->pages_min = (pages_min * zone->present_pages) / + lowmem_pages; + } + + /* + * When interpreting these watermarks, just keep in mind that: + * zone->pages_min == (zone->pages_min * 4) / 4; + */ + zone->pages_low = (zone->pages_min * 5) / 4; + zone->pages_high = (zone->pages_min * 6) / 4; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (64MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = sqrt(lowmem_kbytes * 16) + * + * which yields + * + * 16MB: 512k + * 32MB: 724k + * 64MB: 1024k + * 128MB: 1448k + * 256MB: 2048k + * 512MB: 2896k + * 1024MB: 4096k + * 2048MB: 5792k + * 4096MB: 8192k + * 8192MB: 11584k + * 16384MB: 16384k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + setup_per_zone_pages_min(); + setup_per_zone_lowmem_reserve(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. + */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + setup_per_zone_pages_min(); + return 0; +} + +/* + * lowmem_reserve_ratio_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() + * whenever sysctl_lowmem_reserve_ratio changes. + * + * The reserve ratio obviously has absolutely no relation with the + * pages_min watermarks. The lowmem reserve ratio can only make sense + * if in function of the boot time zone sizes. + */ +int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + setup_per_zone_lowmem_reserve(); + return 0; +} + +__initdata int hashdist = HASHDIST_DEFAULT; + +#ifdef CONFIG_NUMA +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + * - limit is the number of hash buckets, not the total allocation size + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit) +{ + unsigned long long max = limit; + unsigned long log2qty, size; + void *table = NULL; + + /* allow the kernel cmdline to have a say */ + if (!numentries) { + /* round applicable memory size up to nearest megabyte */ + numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; + numentries += (1UL << (20 - PAGE_SHIFT)) - 1; + numentries >>= 20 - PAGE_SHIFT; + numentries <<= 20 - PAGE_SHIFT; + + /* limit to 1 bucket per 2^scale bytes of low memory */ + if (scale > PAGE_SHIFT) + numentries >>= (scale - PAGE_SHIFT); + else + numentries <<= (PAGE_SHIFT - scale); + } + /* rounded up to nearest power of 2 in size */ + numentries = 1UL << (long_log2(numentries) + 1); + + /* limit allocation size to 1/16 total memory by default */ + if (max == 0) { + max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; + do_div(max, bucketsize); + } + + if (numentries > max) + numentries = max; + + log2qty = long_log2(numentries); + + do { + size = bucketsize << log2qty; + if (flags & HASH_EARLY) + table = alloc_bootmem(size); + else if (hashdist) + table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + else { + unsigned long order; + for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) + ; + table = (void*) __get_free_pages(GFP_ATOMIC, order); + } + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + long_log2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} diff --git a/mm/page_io.c b/mm/page_io.c new file mode 100644 index 000000000000..667c76df1ec2 --- /dev/null +++ b/mm/page_io.c @@ -0,0 +1,160 @@ +/* + * linux/mm/page_io.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, + * Asynchronous swapping added 30.12.95. Stephen Tweedie + * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index, + struct page *page, bio_end_io_t end_io) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, 1); + if (bio) { + struct swap_info_struct *sis; + swp_entry_t entry = { .val = index, }; + + sis = get_swap_info_struct(swp_type(entry)); + bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * + (PAGE_SIZE >> 9); + bio->bi_bdev = sis->bdev; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = PAGE_SIZE; + bio->bi_io_vec[0].bv_offset = 0; + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = PAGE_SIZE; + bio->bi_end_io = end_io; + } + return bio; +} + +static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (bio->bi_size) + return 1; + + if (!uptodate) + SetPageError(page); + end_page_writeback(page); + bio_put(bio); + return 0; +} + +static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (bio->bi_size) + return 1; + + if (!uptodate) { + SetPageError(page); + ClearPageUptodate(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + bio_put(bio); + return 0; +} + +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. + */ +int swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio; + int ret = 0, rw = WRITE; + + if (remove_exclusive_swap_page(page)) { + unlock_page(page); + goto out; + } + bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); + if (bio == NULL) { + set_page_dirty(page); + unlock_page(page); + ret = -ENOMEM; + goto out; + } + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= (1 << BIO_RW_SYNC); + inc_page_state(pswpout); + set_page_writeback(page); + unlock_page(page); + submit_bio(rw, bio); +out: + return ret; +} + +int swap_readpage(struct file *file, struct page *page) +{ + struct bio *bio; + int ret = 0; + + BUG_ON(!PageLocked(page)); + ClearPageUptodate(page); + bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); + if (bio == NULL) { + unlock_page(page); + ret = -ENOMEM; + goto out; + } + inc_page_state(pswpin); + submit_bio(READ, bio); +out: + return ret; +} + +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK) +/* + * A scruffy utility function to read or write an arbitrary swap page + * and wait on the I/O. The caller must have a ref on the page. + * + * We use end_swap_bio_read() even for writes, because it happens to do what + * we want. + */ +int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) +{ + struct bio *bio; + int ret = 0; + + lock_page(page); + + bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read); + if (bio == NULL) { + unlock_page(page); + ret = -ENOMEM; + goto out; + } + + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + wait_on_page_locked(page); + + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; +out: + return ret; +} +#endif diff --git a/mm/pdflush.c b/mm/pdflush.c new file mode 100644 index 000000000000..38ce279cc8cd --- /dev/null +++ b/mm/pdflush.c @@ -0,0 +1,228 @@ +/* + * mm/pdflush.c - worker threads for writing back filesystem data + * + * Copyright (C) 2002, Linus Torvalds. + * + * 09Apr2002 akpm@zip.com.au + * Initial version + * 29Feb2004 kaos@sgi.com + * Move worker thread creation to kthread to avoid chewing + * up stack space with nested calls to kernel_thread. + */ + +#include +#include +#include +#include +#include +#include +#include +#include // Needed by writeback.h +#include // Prototypes pdflush_operation() +#include + + +/* + * Minimum and maximum number of pdflush instances + */ +#define MIN_PDFLUSH_THREADS 2 +#define MAX_PDFLUSH_THREADS 8 + +static void start_one_pdflush_thread(void); + + +/* + * The pdflush threads are worker threads for writing back dirty data. + * Ideally, we'd like one thread per active disk spindle. But the disk + * topology is very hard to divine at this level. Instead, we take + * care in various places to prevent more than one pdflush thread from + * performing writeback against a single filesystem. pdflush threads + * have the PF_FLUSHER flag set in current->flags to aid in this. + */ + +/* + * All the pdflush threads. Protected by pdflush_lock + */ +static LIST_HEAD(pdflush_list); +static DEFINE_SPINLOCK(pdflush_lock); + +/* + * The count of currently-running pdflush threads. Protected + * by pdflush_lock. + * + * Readable by sysctl, but not writable. Published to userspace at + * /proc/sys/vm/nr_pdflush_threads. + */ +int nr_pdflush_threads = 0; + +/* + * The time at which the pdflush thread pool last went empty + */ +static unsigned long last_empty_jifs; + +/* + * The pdflush thread. + * + * Thread pool management algorithm: + * + * - The minimum and maximum number of pdflush instances are bound + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. + * + * - If there have been no idle pdflush instances for 1 second, create + * a new one. + * + * - If the least-recently-went-to-sleep pdflush thread has been asleep + * for more than one second, terminate a thread. + */ + +/* + * A structure for passing work to a pdflush thread. Also for passing + * state information between pdflush threads. Protected by pdflush_lock. + */ +struct pdflush_work { + struct task_struct *who; /* The thread */ + void (*fn)(unsigned long); /* A callback function */ + unsigned long arg0; /* An argument to the callback */ + struct list_head list; /* On pdflush_list, when idle */ + unsigned long when_i_went_to_sleep; +}; + +static int __pdflush(struct pdflush_work *my_work) +{ + current->flags |= PF_FLUSHER; + my_work->fn = NULL; + my_work->who = current; + INIT_LIST_HEAD(&my_work->list); + + spin_lock_irq(&pdflush_lock); + nr_pdflush_threads++; + for ( ; ; ) { + struct pdflush_work *pdf; + + set_current_state(TASK_INTERRUPTIBLE); + list_move(&my_work->list, &pdflush_list); + my_work->when_i_went_to_sleep = jiffies; + spin_unlock_irq(&pdflush_lock); + + schedule(); + if (try_to_freeze(PF_FREEZE)) { + spin_lock_irq(&pdflush_lock); + continue; + } + + spin_lock_irq(&pdflush_lock); + if (!list_empty(&my_work->list)) { + printk("pdflush: bogus wakeup!\n"); + my_work->fn = NULL; + continue; + } + if (my_work->fn == NULL) { + printk("pdflush: NULL work function\n"); + continue; + } + spin_unlock_irq(&pdflush_lock); + + (*my_work->fn)(my_work->arg0); + + /* + * Thread creation: For how long have there been zero + * available threads? + */ + if (jiffies - last_empty_jifs > 1 * HZ) { + /* unlocked list_empty() test is OK here */ + if (list_empty(&pdflush_list)) { + /* unlocked test is OK here */ + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) + start_one_pdflush_thread(); + } + } + + spin_lock_irq(&pdflush_lock); + my_work->fn = NULL; + + /* + * Thread destruction: For how long has the sleepiest + * thread slept? + */ + if (list_empty(&pdflush_list)) + continue; + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) + continue; + pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); + if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { + /* Limit exit rate */ + pdf->when_i_went_to_sleep = jiffies; + break; /* exeunt */ + } + } + nr_pdflush_threads--; + spin_unlock_irq(&pdflush_lock); + return 0; +} + +/* + * Of course, my_work wants to be just a local in __pdflush(). It is + * separated out in this manner to hopefully prevent the compiler from + * performing unfortunate optimisations against the auto variables. Because + * these are visible to other tasks and CPUs. (No problem has actually + * been observed. This is just paranoia). + */ +static int pdflush(void *dummy) +{ + struct pdflush_work my_work; + + /* + * pdflush can spend a lot of time doing encryption via dm-crypt. We + * don't want to do that at keventd's priority. + */ + set_user_nice(current, 0); + return __pdflush(&my_work); +} + +/* + * Attempt to wake up a pdflush thread, and get it to do some work for you. + * Returns zero if it indeed managed to find a worker thread, and passed your + * payload to it. + */ +int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) +{ + unsigned long flags; + int ret = 0; + + if (fn == NULL) + BUG(); /* Hard to diagnose if it's deferred */ + + spin_lock_irqsave(&pdflush_lock, flags); + if (list_empty(&pdflush_list)) { + spin_unlock_irqrestore(&pdflush_lock, flags); + ret = -1; + } else { + struct pdflush_work *pdf; + + pdf = list_entry(pdflush_list.next, struct pdflush_work, list); + list_del_init(&pdf->list); + if (list_empty(&pdflush_list)) + last_empty_jifs = jiffies; + pdf->fn = fn; + pdf->arg0 = arg0; + wake_up_process(pdf->who); + spin_unlock_irqrestore(&pdflush_lock, flags); + } + return ret; +} + +static void start_one_pdflush_thread(void) +{ + kthread_run(pdflush, NULL, "pdflush"); +} + +static int __init pdflush_init(void) +{ + int i; + + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) + start_one_pdflush_thread(); + return 0; +} + +module_init(pdflush_init); diff --git a/mm/prio_tree.c b/mm/prio_tree.c new file mode 100644 index 000000000000..b4e76c25f953 --- /dev/null +++ b/mm/prio_tree.c @@ -0,0 +1,207 @@ +/* + * mm/prio_tree.c - priority search tree for mapping->i_mmap + * + * Copyright (C) 2004, Rajesh Venkatasubramanian + * + * This file is released under the GPL v2. + * + * Based on the radix priority search tree proposed by Edward M. McCreight + * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 + * + * 02Feb2004 Initial version + */ + +#include +#include + +/* + * See lib/prio_tree.c for details on the general radix priority search tree + * code. + */ + +/* + * The following #defines are mirrored from lib/prio_tree.c. They're only used + * for debugging, and should be removed (along with the debugging code using + * them) when switching also VMAs to the regular prio_tree code. + */ + +#define RADIX_INDEX(vma) ((vma)->vm_pgoff) +#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) +/* avoid overflow */ +#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) + +/* + * Radix priority search tree for address_space->i_mmap + * + * For each vma that map a unique set of file pages i.e., unique [radix_index, + * heap_index] value, we have a corresponing priority search tree node. If + * multiple vmas have identical [radix_index, heap_index] value, then one of + * them is used as a tree node and others are stored in a vm_set list. The tree + * node points to the first vma (head) of the list using vm_set.head. + * + * prio_tree_root + * | + * A vm_set.head + * / \ / + * L R -> H-I-J-K-M-N-O-P-Q-S + * ^ ^ <-- vm_set.list --> + * tree nodes + * + * We need some way to identify whether a vma is a tree node, head of a vm_set + * list, or just a member of a vm_set list. We cannot use vm_flags to store + * such information. The reason is, in the above figure, it is possible that + * vm_flags' of R and H are covered by the different mmap_sems. When R is + * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold + * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. + * That's why some trick involving shared.vm_set.parent is used for identifying + * tree nodes and list head nodes. + * + * vma radix priority search tree node rules: + * + * vma->shared.vm_set.parent != NULL ==> a tree node + * vma->shared.vm_set.head != NULL ==> list of others mapping same range + * vma->shared.vm_set.head == NULL ==> no others map the same range + * + * vma->shared.vm_set.parent == NULL + * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range + * vma->shared.vm_set.head == NULL ==> a list node + */ + +/* + * Add a new vma known to map the same set of pages as the old vma: + * useful for fork's dup_mmap as well as vma_prio_tree_insert below. + * Note that it just happens to work correctly on i_mmap_nonlinear too. + */ +void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) +{ + /* Leave these BUG_ONs till prio_tree patch stabilizes */ + BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); + BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); + + vma->shared.vm_set.head = NULL; + vma->shared.vm_set.parent = NULL; + + if (!old->shared.vm_set.parent) + list_add(&vma->shared.vm_set.list, + &old->shared.vm_set.list); + else if (old->shared.vm_set.head) + list_add_tail(&vma->shared.vm_set.list, + &old->shared.vm_set.head->shared.vm_set.list); + else { + INIT_LIST_HEAD(&vma->shared.vm_set.list); + vma->shared.vm_set.head = old; + old->shared.vm_set.head = vma; + } +} + +void vma_prio_tree_insert(struct vm_area_struct *vma, + struct prio_tree_root *root) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *old; + + vma->shared.vm_set.head = NULL; + + ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); + if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { + old = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + vma_prio_tree_add(vma, old); + } +} + +void vma_prio_tree_remove(struct vm_area_struct *vma, + struct prio_tree_root *root) +{ + struct vm_area_struct *node, *head, *new_head; + + if (!vma->shared.vm_set.head) { + if (!vma->shared.vm_set.parent) + list_del_init(&vma->shared.vm_set.list); + else + raw_prio_tree_remove(root, &vma->shared.prio_tree_node); + } else { + /* Leave this BUG_ON till prio_tree patch stabilizes */ + BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); + if (vma->shared.vm_set.parent) { + head = vma->shared.vm_set.head; + if (!list_empty(&head->shared.vm_set.list)) { + new_head = list_entry( + head->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&head->shared.vm_set.list); + } else + new_head = NULL; + + raw_prio_tree_replace(root, &vma->shared.prio_tree_node, + &head->shared.prio_tree_node); + head->shared.vm_set.head = new_head; + if (new_head) + new_head->shared.vm_set.head = head; + + } else { + node = vma->shared.vm_set.head; + if (!list_empty(&vma->shared.vm_set.list)) { + new_head = list_entry( + vma->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&vma->shared.vm_set.list); + node->shared.vm_set.head = new_head; + new_head->shared.vm_set.head = node; + } else + node->shared.vm_set.head = NULL; + } + } +} + +/* + * Helper function to enumerate vmas that map a given file page or a set of + * contiguous file pages. The function returns vmas that at least map a single + * page in the given range of contiguous file pages. + */ +struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, + struct prio_tree_iter *iter) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *next; + + if (!vma) { + /* + * First call is with NULL vma + */ + ptr = prio_tree_next(iter); + if (ptr) { + next = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + prefetch(next->shared.vm_set.head); + return next; + } else + return NULL; + } + + if (vma->shared.vm_set.parent) { + if (vma->shared.vm_set.head) { + next = vma->shared.vm_set.head; + prefetch(next->shared.vm_set.list.next); + return next; + } + } else { + next = list_entry(vma->shared.vm_set.list.next, + struct vm_area_struct, shared.vm_set.list); + if (!next->shared.vm_set.head) { + prefetch(next->shared.vm_set.list.next); + return next; + } + } + + ptr = prio_tree_next(iter); + if (ptr) { + next = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + prefetch(next->shared.vm_set.head); + return next; + } else + return NULL; +} diff --git a/mm/readahead.c b/mm/readahead.c new file mode 100644 index 000000000000..b840e7c6ea74 --- /dev/null +++ b/mm/readahead.c @@ -0,0 +1,557 @@ +/* + * mm/readahead.c - address_space-level file readahead. + * + * Copyright (C) 2002, Linus Torvalds + * + * 09Apr2002 akpm@zip.com.au + * Initial version. + */ + +#include +#include +#include +#include +#include +#include +#include + +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ +} +EXPORT_SYMBOL(default_unplug_io_fn); + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; +EXPORT_SYMBOL_GPL(default_backing_dev_info); + +/* + * Initialise a struct file's readahead state. Assumes that the caller has + * memset *ra to zero. + */ +void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) +{ + ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->prev_page = -1; +} + +/* + * Return max readahead size for this inode in number-of-pages. + */ +static inline unsigned long get_max_readahead(struct file_ra_state *ra) +{ + return ra->ra_pages; +} + +static inline unsigned long get_min_readahead(struct file_ra_state *ra) +{ + return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; +} + +static inline void ra_off(struct file_ra_state *ra) +{ + ra->start = 0; + ra->flags = 0; + ra->size = 0; + ra->ahead_start = 0; + ra->ahead_size = 0; + return; +} + +/* + * Set the initial window size, round to next power of 2 and square + * for small size, x 4 for medium, and x 2 for large + * for 128k (32 page) max ra + * 1-8 page = 32k initial, > 8 page = 128k initial + */ +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) +{ + unsigned long newsize = roundup_pow_of_two(size); + + if (newsize <= max / 64) + newsize = newsize * newsize; + else if (newsize <= max / 4) + newsize = max / 4; + else + newsize = max; + return newsize; +} + +/* + * Set the new window size, this is called only when I/O is to be submitted, + * not for each call to readahead. If a cache miss occured, reduce next I/O + * size, else increase depending on how close to max we are. + */ +static inline unsigned long get_next_ra_size(struct file_ra_state *ra) +{ + unsigned long max = get_max_readahead(ra); + unsigned long min = get_min_readahead(ra); + unsigned long cur = ra->size; + unsigned long newsize; + + if (ra->flags & RA_FLAG_MISS) { + ra->flags &= ~RA_FLAG_MISS; + newsize = max((cur - 2), min); + } else if (cur < max / 16) { + newsize = 4 * cur; + } else { + newsize = 2 * cur; + } + return min(newsize, max); +} + +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) + +/** + * read_cache_pages - populate an address space with some pages, and + * start reads against them. + * @mapping: the address_space + * @pages: The address of a list_head which contains the target pages. These + * pages have their ->index populated and are otherwise uninitialised. + * @filler: callback routine for filling a single page. + * @data: private data for the callback routine. + * + * Hides the details of the LRU cache etc from the filesystems. + */ +int read_cache_pages(struct address_space *mapping, struct list_head *pages, + int (*filler)(void *, struct page *), void *data) +{ + struct page *page; + struct pagevec lru_pvec; + int ret = 0; + + pagevec_init(&lru_pvec, 0); + + while (!list_empty(pages)) { + page = list_to_page(pages); + list_del(&page->lru); + if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { + page_cache_release(page); + continue; + } + ret = filler(data, page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + if (ret) { + while (!list_empty(pages)) { + struct page *victim; + + victim = list_to_page(pages); + list_del(&victim->lru); + page_cache_release(victim); + } + break; + } + } + pagevec_lru_add(&lru_pvec); + return ret; +} + +EXPORT_SYMBOL(read_cache_pages); + +static int read_pages(struct address_space *mapping, struct file *filp, + struct list_head *pages, unsigned nr_pages) +{ + unsigned page_idx; + struct pagevec lru_pvec; + int ret = 0; + + if (mapping->a_ops->readpages) { + ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + goto out; + } + + pagevec_init(&lru_pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_to_page(pages); + list_del(&page->lru); + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + mapping->a_ops->readpage(filp, page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } + } + pagevec_lru_add(&lru_pvec); +out: + return ret; +} + +/* + * Readahead design. + * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * start: Page index at which we started the readahead + * size: Number of pages in that read + * Together, these form the "current window". + * Together, start and size represent the `readahead window'. + * prev_page: The page which the readahead algorithm most-recently inspected. + * It is mainly used to detect sequential file reading. + * If page_cache_readahead sees that it is again being called for + * a page which it just looked at, it can return immediately without + * making any state changes. + * ahead_start, + * ahead_size: Together, these form the "ahead window". + * ra_pages: The externally controlled max readahead for this fd. + * + * When readahead is in the off state (size == 0), readahead is disabled. + * In this state, prev_page is used to detect the resumption of sequential I/O. + * + * The readahead code manages two windows - the "current" and the "ahead" + * windows. The intent is that while the application is walking the pages + * in the current window, I/O is underway on the ahead window. When the + * current window is fully traversed, it is replaced by the ahead window + * and the ahead window is invalidated. When this copying happens, the + * new current window's pages are probably still locked. So + * we submit a new batch of I/O immediately, creating a new ahead window. + * + * So: + * + * ----|----------------|----------------|----- + * ^start ^start+size + * ^ahead_start ^ahead_start+ahead_size + * + * ^ When this page is read, we submit I/O for the + * ahead window. + * + * A `readahead hit' occurs when a read request is made against a page which is + * the next sequential page. Ahead window calculations are done only when it + * is time to submit a new IO. The code ramps up the size agressively at first, + * but slow down as it approaches max_readhead. + * + * Any seek/ramdom IO will result in readahead being turned off. It will resume + * at the first sequential access. + * + * There is a special-case: if the first page which the application tries to + * read happens to be the first page of the file, it is assumed that a linear + * read is about to happen and the window is immediately set to the initial size + * based on I/O request size and the max_readahead. + * + * This function is to be called for every read request, rather than when + * it is time to perform readahead. It is called only once for the entire I/O + * regardless of size unless readahead is unable to start enough I/O to satisfy + * the request (I/O request > max_readahead). + */ + +/* + * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * the pages first, then submits them all for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + * + * Returns the number of pages requested, or the maximum amount of I/O allowed. + * + * do_page_cache_readahead() returns -1 if it encountered request queue + * congestion. + */ +static int +__do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read) +{ + struct inode *inode = mapping->host; + struct page *page; + unsigned long end_index; /* The last page we want to read */ + LIST_HEAD(page_pool); + int page_idx; + int ret = 0; + loff_t isize = i_size_read(inode); + + if (isize == 0) + goto out; + + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + /* + * Preallocate as many pages as we will need. + */ + read_lock_irq(&mapping->tree_lock); + for (page_idx = 0; page_idx < nr_to_read; page_idx++) { + unsigned long page_offset = offset + page_idx; + + if (page_offset > end_index) + break; + + page = radix_tree_lookup(&mapping->page_tree, page_offset); + if (page) + continue; + + read_unlock_irq(&mapping->tree_lock); + page = page_cache_alloc_cold(mapping); + read_lock_irq(&mapping->tree_lock); + if (!page) + break; + page->index = page_offset; + list_add(&page->lru, &page_pool); + ret++; + } + read_unlock_irq(&mapping->tree_lock); + + /* + * Now start the IO. We ignore I/O errors - if the page is not + * uptodate then the caller will launch readpage again, and + * will then handle the error. + */ + if (ret) + read_pages(mapping, filp, &page_pool, ret); + BUG_ON(!list_empty(&page_pool)); +out: + return ret; +} + +/* + * Chunk the readahead into 2 megabyte units, so that we don't pin too much + * memory at once. + */ +int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read) +{ + int ret = 0; + + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) + return -EINVAL; + + while (nr_to_read) { + int err; + + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; + + if (this_chunk > nr_to_read) + this_chunk = nr_to_read; + err = __do_page_cache_readahead(mapping, filp, + offset, this_chunk); + if (err < 0) { + ret = err; + break; + } + ret += err; + offset += this_chunk; + nr_to_read -= this_chunk; + } + return ret; +} + +/* + * Check how effective readahead is being. If the amount of started IO is + * less than expected then the file is partly or fully in pagecache and + * readahead isn't helping. + * + */ +static inline int check_ra_success(struct file_ra_state *ra, + unsigned long nr_to_read, unsigned long actual) +{ + if (actual == 0) { + ra->cache_hit += nr_to_read; + if (ra->cache_hit >= VM_MAX_CACHE_HIT) { + ra_off(ra); + ra->flags |= RA_FLAG_INCACHE; + return 0; + } + } else { + ra->cache_hit=0; + } + return 1; +} + +/* + * This version skips the IO if the queue is read-congested, and will tell the + * block layer to abandon the readahead if request allocation would block. + * + * force_page_cache_readahead() will ignore queue congestion and will block on + * request queues. + */ +int do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read) +{ + if (bdi_read_congested(mapping->backing_dev_info)) + return -1; + + return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); +} + +/* + * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' + * is set wait till the read completes. Otherwise attempt to read without + * blocking. + * Returns 1 meaning 'success' if read is succesfull without switching off + * readhaead mode. Otherwise return failure. + */ +static int +blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read, + struct file_ra_state *ra, int block) +{ + int actual; + + if (!block && bdi_read_congested(mapping->backing_dev_info)) + return 0; + + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + + return check_ra_success(ra, nr_to_read, actual); +} + +static int make_ahead_window(struct address_space *mapping, struct file *filp, + struct file_ra_state *ra, int force) +{ + int block, ret; + + ra->ahead_size = get_next_ra_size(ra); + ra->ahead_start = ra->start + ra->size; + + block = force || (ra->prev_page >= ra->ahead_start); + ret = blockable_page_cache_readahead(mapping, filp, + ra->ahead_start, ra->ahead_size, ra, block); + + if (!ret && !force) { + /* A read failure in blocking mode, implies pages are + * all cached. So we can safely assume we have taken + * care of all the pages requested in this call. + * A read failure in non-blocking mode, implies we are + * reading more pages than requested in this call. So + * we safely assume we have taken care of all the pages + * requested in this call. + * + * Just reset the ahead window in case we failed due to + * congestion. The ahead window will any way be closed + * in case we failed due to excessive page cache hits. + */ + ra->ahead_start = 0; + ra->ahead_size = 0; + } + + return ret; +} + +/* + * page_cache_readahead is the main function. If performs the adaptive + * readahead window size management and submits the readahead I/O. + */ +unsigned long +page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, + struct file *filp, unsigned long offset, + unsigned long req_size) +{ + unsigned long max, newsize; + int sequential; + + /* + * We avoid doing extra work and bogusly perturbing the readahead + * window expansion logic. + */ + if (offset == ra->prev_page && --req_size) + ++offset; + + /* Note that prev_page == -1 if it is a first read */ + sequential = (offset == ra->prev_page + 1); + ra->prev_page = offset; + + max = get_max_readahead(ra); + newsize = min(req_size, max); + + /* No readahead or sub-page sized read or file already in cache */ + if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) + goto out; + + ra->prev_page += newsize - 1; + + /* + * Special case - first read at start of file. We'll assume it's + * a whole-file read and grow the window fast. Or detect first + * sequential access + */ + if (sequential && ra->size == 0) { + ra->size = get_init_ra_size(newsize, max); + ra->start = offset; + if (!blockable_page_cache_readahead(mapping, filp, offset, + ra->size, ra, 1)) + goto out; + + /* + * If the request size is larger than our max readahead, we + * at least want to be sure that we get 2 IOs in flight and + * we know that we will definitly need the new I/O. + * once we do this, subsequent calls should be able to overlap + * IOs,* thus preventing stalls. so issue the ahead window + * immediately. + */ + if (req_size >= max) + make_ahead_window(mapping, filp, ra, 1); + + goto out; + } + + /* + * Now handle the random case: + * partial page reads and first access were handled above, + * so this must be the next page otherwise it is random + */ + if (!sequential) { + ra_off(ra); + blockable_page_cache_readahead(mapping, filp, offset, + newsize, ra, 1); + goto out; + } + + /* + * If we get here we are doing sequential IO and this was not the first + * occurence (ie we have an existing window) + */ + + if (ra->ahead_start == 0) { /* no ahead window yet */ + if (!make_ahead_window(mapping, filp, ra, 0)) + goto out; + } + /* + * Already have an ahead window, check if we crossed into it. + * If so, shift windows and issue a new ahead window. + * Only return the #pages that are in the current window, so that + * we get called back on the first page of the ahead window which + * will allow us to submit more IO. + */ + if (ra->prev_page >= ra->ahead_start) { + ra->start = ra->ahead_start; + ra->size = ra->ahead_size; + make_ahead_window(mapping, filp, ra, 0); + } + +out: + return ra->prev_page + 1; +} + +/* + * handle_ra_miss() is called when it is known that a page which should have + * been present in the pagecache (we just did some readahead there) was in fact + * not found. This will happen if it was evicted by the VM (readahead + * thrashing) + * + * Turn on the cache miss flag in the RA struct, this will cause the RA code + * to reduce the RA size on the next read. + */ +void handle_ra_miss(struct address_space *mapping, + struct file_ra_state *ra, pgoff_t offset) +{ + ra->flags |= RA_FLAG_MISS; + ra->flags &= ~RA_FLAG_INCACHE; +} + +/* + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. + */ +unsigned long max_sane_readahead(unsigned long nr) +{ + unsigned long active; + unsigned long inactive; + unsigned long free; + + __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); + return min(nr, (inactive + free) / 2); +} diff --git a/mm/rmap.c b/mm/rmap.c new file mode 100644 index 000000000000..884d6d1928bc --- /dev/null +++ b/mm/rmap.c @@ -0,0 +1,862 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. + * + * Provides methods for unmapping each kind of mapped page: + * the anon methods track anonymous pages, and + * the file methods track pages belonging to an inode. + * + * Original design by Rik van Riel 2001 + * File methods by Dave McCracken 2003, 2004 + * Anonymous methods by Andrea Arcangeli 2004 + * Contributions by Hugh Dickins 2003, 2004 + */ + +/* + * Lock ordering in mm: + * + * inode->i_sem (while writing or truncating, not reading or faulting) + * inode->i_alloc_sem + * + * When a page fault occurs in writing from user to file, down_read + * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within + * down_read of mmap_sem; i_sem and down_write of mmap_sem are never + * taken together; in truncation, i_sem is taken outermost. + * + * mm->mmap_sem + * page->flags PG_locked (lock_page) + * mapping->i_mmap_lock + * anon_vma->lock + * mm->page_table_lock + * zone->lru_lock (in mark_page_accessed) + * swap_list_lock (in swap_free etc's swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * swap_device_lock (in swap_duplicate, swap_info_get) + * mapping->private_lock (in __set_page_dirty_buffers) + * inode_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * mapping->tree_lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within inode_lock in __sync_single_inode) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +//#define RMAP_DEBUG /* can be enabled only for debugging */ + +kmem_cache_t *anon_vma_cachep; + +static inline void validate_anon_vma(struct vm_area_struct *find_vma) +{ +#ifdef RMAP_DEBUG + struct anon_vma *anon_vma = find_vma->anon_vma; + struct vm_area_struct *vma; + unsigned int mapcount = 0; + int found = 0; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + mapcount++; + BUG_ON(mapcount > 100000); + if (vma == find_vma) + found = 1; + } + BUG_ON(!found); +#endif +} + +/* This must be called under the mmap_sem. */ +int anon_vma_prepare(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + might_sleep(); + if (unlikely(!anon_vma)) { + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *allocated, *locked; + + anon_vma = find_mergeable_anon_vma(vma); + if (anon_vma) { + allocated = NULL; + locked = anon_vma; + spin_lock(&locked->lock); + } else { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + return -ENOMEM; + allocated = anon_vma; + locked = NULL; + } + + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + list_add(&vma->anon_vma_node, &anon_vma->head); + allocated = NULL; + } + spin_unlock(&mm->page_table_lock); + + if (locked) + spin_unlock(&locked->lock); + if (unlikely(allocated)) + anon_vma_free(allocated); + } + return 0; +} + +void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) +{ + BUG_ON(vma->anon_vma != next->anon_vma); + list_del(&next->anon_vma_node); +} + +void __anon_vma_link(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + if (anon_vma) { + list_add(&vma->anon_vma_node, &anon_vma->head); + validate_anon_vma(vma); + } +} + +void anon_vma_link(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + if (anon_vma) { + spin_lock(&anon_vma->lock); + list_add(&vma->anon_vma_node, &anon_vma->head); + validate_anon_vma(vma); + spin_unlock(&anon_vma->lock); + } +} + +void anon_vma_unlink(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int empty; + + if (!anon_vma) + return; + + spin_lock(&anon_vma->lock); + validate_anon_vma(vma); + list_del(&vma->anon_vma_node); + + /* We must garbage collect the anon_vma if it's empty */ + empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + + if (empty) + anon_vma_free(anon_vma); +} + +static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + struct anon_vma *anon_vma = data; + + spin_lock_init(&anon_vma->lock); + INIT_LIST_HEAD(&anon_vma->head); + } +} + +void __init anon_vma_init(void) +{ + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); +} + +/* + * Getting a lock on a stable anon_vma from a page off the LRU is + * tricky: page_lock_anon_vma rely on RCU to guard against the races. + */ +static struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = NULL; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) page->mapping; + if (!(anon_mapping & PAGE_MAPPING_ANON)) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); +out: + rcu_read_unlock(); + return anon_vma; +} + +/* + * At what user virtual address is page expected in vma? + */ +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + /* page should be within any vma from prio_tree_next */ + BUG_ON(!PageAnon(page)); + return -EFAULT; + } + return address; +} + +/* + * At what user virtual address is page expected in vma? checking that the + * page matches the vma: currently only used by unuse_process, on anon pages. + */ +unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +{ + if (PageAnon(page)) { + if ((void *)vma->anon_vma != + (void *)page->mapping - PAGE_MAPPING_ANON) + return -EFAULT; + } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { + if (vma->vm_file->f_mapping != page->mapping) + return -EFAULT; + } else + return -EFAULT; + return vma_address(page, vma); +} + +/* + * Subfunctions of page_referenced: page_referenced_one called + * repeatedly from either page_referenced_anon or page_referenced_file. + */ +static int page_referenced_one(struct page *page, + struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int referenced = 0; + + if (!get_mm_counter(mm, rss)) + goto out; + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + spin_lock(&mm->page_table_lock); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out_unlock; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (ptep_clear_flush_young(vma, address, pte)) + referenced++; + + if (mm != current->mm && !ignore_token && has_swap_token(mm)) + referenced++; + + (*mapcount)--; + +out_unmap: + pte_unmap(pte); +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return referenced; +} + +static int page_referenced_anon(struct page *page, int ignore_token) +{ + unsigned int mapcount; + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + int referenced = 0; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return referenced; + + mapcount = page_mapcount(page); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + referenced += page_referenced_one(page, vma, &mapcount, + ignore_token); + if (!mapcount) + break; + } + spin_unlock(&anon_vma->lock); + return referenced; +} + +/** + * page_referenced_file - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + */ +static int page_referenced_file(struct page *page, int ignore_token) +{ + unsigned int mapcount; + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int referenced = 0; + + /* + * The caller's checks on page->mapping and !PageAnon have made + * sure that this is a file page: the check for page->mapping + * excludes the case just before it gets set on an anon page. + */ + BUG_ON(PageAnon(page)); + + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_lock. + */ + BUG_ON(!PageLocked(page)); + + spin_lock(&mapping->i_mmap_lock); + + /* + * i_mmap_lock does not stabilize mapcount at all, but mapcount + * is more likely to be accurate if we note it after spinning. + */ + mapcount = page_mapcount(page); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) + == (VM_LOCKED|VM_MAYSHARE)) { + referenced++; + break; + } + referenced += page_referenced_one(page, vma, &mapcount, + ignore_token); + if (!mapcount) + break; + } + + spin_unlock(&mapping->i_mmap_lock); + return referenced; +} + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * @is_locked: caller holds lock on the page + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of ptes which referenced the page. + */ +int page_referenced(struct page *page, int is_locked, int ignore_token) +{ + int referenced = 0; + + if (!swap_token_default_timeout) + ignore_token = 1; + + if (page_test_and_clear_young(page)) + referenced++; + + if (TestClearPageReferenced(page)) + referenced++; + + if (page_mapped(page) && page->mapping) { + if (PageAnon(page)) + referenced += page_referenced_anon(page, ignore_token); + else if (is_locked) + referenced += page_referenced_file(page, ignore_token); + else if (TestSetPageLocked(page)) + referenced++; + else { + if (page->mapping) + referenced += page_referenced_file(page, + ignore_token); + unlock_page(page); + } + } + return referenced; +} + +/** + * page_add_anon_rmap - add pte mapping to an anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * The caller needs to hold the mm->page_table_lock. + */ +void page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + pgoff_t index; + + BUG_ON(PageReserved(page)); + BUG_ON(!anon_vma); + + inc_mm_counter(vma->vm_mm, anon_rss); + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + index = (address - vma->vm_start) >> PAGE_SHIFT; + index += vma->vm_pgoff; + index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + + if (atomic_inc_and_test(&page->_mapcount)) { + page->index = index; + page->mapping = (struct address_space *) anon_vma; + inc_page_state(nr_mapped); + } + /* else checking page index and mapping is racy */ +} + +/** + * page_add_file_rmap - add pte mapping to a file page + * @page: the page to add the mapping to + * + * The caller needs to hold the mm->page_table_lock. + */ +void page_add_file_rmap(struct page *page) +{ + BUG_ON(PageAnon(page)); + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + if (atomic_inc_and_test(&page->_mapcount)) + inc_page_state(nr_mapped); +} + +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * + * Caller needs to hold the mm->page_table_lock. + */ +void page_remove_rmap(struct page *page) +{ + BUG_ON(PageReserved(page)); + + if (atomic_add_negative(-1, &page->_mapcount)) { + BUG_ON(page_mapcount(page) < 0); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ + if (page_test_and_clear_dirty(page)) + set_page_dirty(page); + dec_page_state(nr_mapped); + } +} + +/* + * Subfunctions of try_to_unmap: try_to_unmap_one called + * repeatedly from either try_to_unmap_anon or try_to_unmap_file. + */ +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!get_mm_counter(mm, rss)) + goto out; + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + spin_lock(&mm->page_table_lock); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out_unlock; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + ptep_clear_flush_young(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + + /* + * Don't pull an anonymous page out from under get_user_pages. + * GUP carefully breaks COW and raises page count (while holding + * page_table_lock, as we have here) to make sure that the page + * cannot be freed. If we unmap that page here, a user write + * access to the virtual address will bring back the page, but + * its raised count will (ironically) be taken to mean it's not + * an exclusive swap page, do_wp_page will replace it by a copy + * page, and the user never get to see the data GUP was holding + * the original page for. + * + * This test is also useful for when swapoff (unuse_process) has + * to drop page lock: its reference to the page stops existing + * ptes from being unmapped, so swapoff can make progress. + */ + if (PageSwapCache(page) && + page_count(page) != page_mapcount(page) + 2) { + ret = SWAP_FAIL; + goto out_unmap; + } + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, page_to_pfn(page)); + pteval = ptep_clear_flush(vma, address, pte); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (PageAnon(page)) { + swp_entry_t entry = { .val = page->private }; + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + BUG_ON(!PageSwapCache(page)); + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + BUG_ON(pte_file(*pte)); + dec_mm_counter(mm, anon_rss); + } + + inc_mm_counter(mm, rss); + page_remove_rmap(page); + page_cache_release(page); + +out_unmap: + pte_unmap(pte); +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +/* + * objrmap doesn't work for nonlinear VMAs because the assumption that + * offset-into-file correlates with offset-into-virtual-addresses does not hold. + * Consequently, given a particular page and its ->index, we cannot locate the + * ptes which are mapping that page without an exhaustive linear search. + * + * So what this code does is a mini "virtual scan" of each nonlinear VMA which + * maps the file to which the target page belongs. The ->vm_private_data field + * holds the current cursor into that scan. Successive searches will circulate + * around the vma's virtual address space. + * + * So as more replacement pressure is applied to the pages in a nonlinear VMA, + * more scanning pressure is placed against them as well. Eventually pages + * will become fully unmapped and are eligible for eviction. + * + * For very sparsely populated VMAs this is a little inefficient - chances are + * there there won't be many ptes located within the scan cluster. In this case + * maybe we could scan further - to the end of the pte page, perhaps. + */ +#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) +#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) + +static void try_to_unmap_cluster(unsigned long cursor, + unsigned int *mapcount, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + struct page *page; + unsigned long address; + unsigned long end; + unsigned long pfn; + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + spin_lock(&mm->page_table_lock); + + address = (vma->vm_start + cursor) & CLUSTER_MASK; + end = address + CLUSTER_SIZE; + if (address < vma->vm_start) + address = vma->vm_start; + if (end > vma->vm_end) + end = vma->vm_end; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out_unlock; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + for (pte = pte_offset_map(pmd, address); + address < end; pte++, address += PAGE_SIZE) { + + if (!pte_present(*pte)) + continue; + + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + BUG_ON(PageAnon(page)); + if (PageReserved(page)) + continue; + + if (ptep_clear_flush_young(vma, address, pte)) + continue; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pfn); + pteval = ptep_clear_flush(vma, address, pte); + + /* If nonlinear, store the file page offset in the pte. */ + if (page->index != linear_page_index(vma, address)) + set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + page_remove_rmap(page); + page_cache_release(page); + dec_mm_counter(mm, rss); + (*mapcount)--; + } + + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); +} + +static int try_to_unmap_anon(struct page *page) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return ret; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + ret = try_to_unmap_one(page, vma); + if (ret == SWAP_FAIL || !page_mapped(page)) + break; + } + spin_unlock(&anon_vma->lock); + return ret; +} + +/** + * try_to_unmap_file - unmap file page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + */ +static int try_to_unmap_file(struct page *page) +{ + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; + unsigned int mapcount; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + ret = try_to_unmap_one(page, vma); + if (ret == SWAP_FAIL || !page_mapped(page)) + goto out; + } + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto out; + + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, + shared.vm_set.list) { + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; + } + + if (max_nl_size == 0) { /* any nonlinears locked or reserved */ + ret = SWAP_FAIL; + goto out; + } + + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? + */ + mapcount = page_mapcount(page); + if (!mapcount) + goto out; + cond_resched_lock(&mapping->i_mmap_lock); + + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, + shared.vm_set.list) { + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + while (get_mm_counter(vma->vm_mm, rss) && + cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + try_to_unmap_cluster(cursor, &mapcount, vma); + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if ((int)mapcount <= 0) + goto out; + } + vma->vm_private_data = (void *) max_nl_cursor; + } + cond_resched_lock(&mapping->i_mmap_lock); + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); + + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, + shared.vm_set.list) { + if (!(vma->vm_flags & VM_RESERVED)) + vma->vm_private_data = NULL; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock. + * Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a mapping, try again later + * SWAP_FAIL - the page is unswappable + */ +int try_to_unmap(struct page *page) +{ + int ret; + + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + + if (PageAnon(page)) + ret = try_to_unmap_anon(page); + else + ret = try_to_unmap_file(page); + + if (!page_mapped(page)) + ret = SWAP_SUCCESS; + return ret; +} diff --git a/mm/shmem.c b/mm/shmem.c new file mode 100644 index 000000000000..61574b81d979 --- /dev/null +++ b/mm/shmem.c @@ -0,0 +1,2326 @@ +/* + * Resizable virtual memory filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG + * 2002 Red Hat Inc. + * Copyright (C) 2002-2004 Hugh Dickins. + * Copyright (C) 2002-2004 VERITAS Software Corporation. + * Copyright (C) 2004 Andi Kleen, SuSE Labs + * + * Extended attribute support for tmpfs: + * Copyright (c) 2004, Luke Kenneth Casson Leighton + * Copyright (c) 2004 Red Hat, Inc., James Morris + * + * This file is released under the GPL. + */ + +/* + * This virtual memory filesystem is heavily based on the ramfs. It + * extends ramfs by the ability to use swap and honor resource limits + * which makes it a completely usable filesystem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This magic number is used in glibc for posix shared memory */ +#define TMPFS_MAGIC 0x01021994 + +#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) +#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) + +#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) + +#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) + +/* info->flags needs VM_flags to handle pagein/truncate races efficiently */ +#define SHMEM_PAGEIN VM_READ +#define SHMEM_TRUNCATE VM_WRITE + +/* Definition to limit shmem_truncate's steps between cond_rescheds */ +#define LATENCY_LIMIT 64 + +/* Pretend that each entry is of this size in directory's i_size */ +#define BOGO_DIRENT_SIZE 20 + +/* Keep swapped page count in private field of indirect struct page */ +#define nr_swapped private + +/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +enum sgp_type { + SGP_QUICK, /* don't try more than file page cache lookup */ + SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate page */ +}; + +static int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type); + +static inline struct page *shmem_dir_alloc(unsigned int gfp_mask) +{ + /* + * The above definition of ENTRIES_PER_PAGE, and the use of + * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: + * might be reconsidered if it ever diverges from PAGE_SIZE. + */ + return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); +} + +static inline void shmem_dir_free(struct page *page) +{ + __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); +} + +static struct page **shmem_dir_map(struct page *page) +{ + return (struct page **)kmap_atomic(page, KM_USER0); +} + +static inline void shmem_dir_unmap(struct page **dir) +{ + kunmap_atomic(dir, KM_USER0); +} + +static swp_entry_t *shmem_swp_map(struct page *page) +{ + return (swp_entry_t *)kmap_atomic(page, KM_USER1); +} + +static inline void shmem_swp_balance_unmap(void) +{ + /* + * When passing a pointer to an i_direct entry, to code which + * also handles indirect entries and so will shmem_swp_unmap, + * we must arrange for the preempt count to remain in balance. + * What kmap_atomic of a lowmem page does depends on config + * and architecture, so pretend to kmap_atomic some lowmem page. + */ + (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); +} + +static inline void shmem_swp_unmap(swp_entry_t *entry) +{ + kunmap_atomic(entry, KM_USER1); +} + +static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * shmem_file_setup pre-accounts the whole fixed size of a VM object, + * for shared memory and for shared anonymous (/dev/zero) mappings + * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), + * consistent with the pre-accounting of private mappings ... + */ +static inline int shmem_acct_size(unsigned long flags, loff_t size) +{ + return (flags & VM_ACCOUNT)? + security_vm_enough_memory(VM_ACCT(size)): 0; +} + +static inline void shmem_unacct_size(unsigned long flags, loff_t size) +{ + if (flags & VM_ACCOUNT) + vm_unacct_memory(VM_ACCT(size)); +} + +/* + * ... whereas tmpfs objects are accounted incrementally as + * pages are allocated, in order to allow huge sparse files. + * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. + */ +static inline int shmem_acct_block(unsigned long flags) +{ + return (flags & VM_ACCOUNT)? + 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE)); +} + +static inline void shmem_unacct_blocks(unsigned long flags, long pages) +{ + if (!(flags & VM_ACCOUNT)) + vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); +} + +static struct super_operations shmem_ops; +static struct address_space_operations shmem_aops; +static struct file_operations shmem_file_operations; +static struct inode_operations shmem_inode_operations; +static struct inode_operations shmem_dir_inode_operations; +static struct inode_operations shmem_special_inode_operations; +static struct vm_operations_struct shmem_vm_ops; + +static struct backing_dev_info shmem_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .unplug_io_fn = default_unplug_io_fn, +}; + +static LIST_HEAD(shmem_swaplist); +static DEFINE_SPINLOCK(shmem_swaplist_lock); + +static void shmem_free_blocks(struct inode *inode, long pages) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks += pages; + inode->i_blocks -= pages*BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + } +} + +/* + * shmem_recalc_inode - recalculate the size of an inode + * + * @inode: inode to recalc + * + * We have to calculate the free blocks since the mm can drop + * undirtied hole pages behind our back. + * + * But normally info->alloced == inode->i_mapping->nrpages + info->swapped + * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * It has to be called with the spinlock held. + */ +static void shmem_recalc_inode(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; + + freed = info->alloced - info->swapped - inode->i_mapping->nrpages; + if (freed > 0) { + info->alloced -= freed; + shmem_unacct_blocks(info->flags, freed); + shmem_free_blocks(inode, freed); + } +} + +/* + * shmem_swp_entry - find the swap vector position in the info structure + * + * @info: info structure for the inode + * @index: index of the page to find + * @page: optional page to add to the structure. Has to be preset to + * all zeros + * + * If there is no space allocated yet it will return NULL when + * page is NULL, else it will use the page for the needed block, + * setting it to NULL on return to indicate that it has been used. + * + * The swap vector is organized the following way: + * + * There are SHMEM_NR_DIRECT entries directly stored in the + * shmem_inode_info structure. So small files do not need an addional + * allocation. + * + * For pages with index > SHMEM_NR_DIRECT there is the pointer + * i_indirect which points to a page which holds in the first half + * doubly indirect blocks, in the second half triple indirect blocks: + * + * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the + * following layout (for SHMEM_NR_DIRECT == 16): + * + * i_indirect -> dir --> 16-19 + * | +-> 20-23 + * | + * +-->dir2 --> 24-27 + * | +-> 28-31 + * | +-> 32-35 + * | +-> 36-39 + * | + * +-->dir3 --> 40-43 + * +-> 44-47 + * +-> 48-51 + * +-> 52-55 + */ +static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) +{ + unsigned long offset; + struct page **dir; + struct page *subdir; + + if (index < SHMEM_NR_DIRECT) { + shmem_swp_balance_unmap(); + return info->i_direct+index; + } + if (!info->i_indirect) { + if (page) { + info->i_indirect = *page; + *page = NULL; + } + return NULL; /* need another page */ + } + + index -= SHMEM_NR_DIRECT; + offset = index % ENTRIES_PER_PAGE; + index /= ENTRIES_PER_PAGE; + dir = shmem_dir_map(info->i_indirect); + + if (index >= ENTRIES_PER_PAGE/2) { + index -= ENTRIES_PER_PAGE/2; + dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; + index %= ENTRIES_PER_PAGE; + subdir = *dir; + if (!subdir) { + if (page) { + *dir = *page; + *page = NULL; + } + shmem_dir_unmap(dir); + return NULL; /* need another page */ + } + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir); + } + + dir += index; + subdir = *dir; + if (!subdir) { + if (!page || !(subdir = *page)) { + shmem_dir_unmap(dir); + return NULL; /* need a page */ + } + *dir = subdir; + *page = NULL; + } + shmem_dir_unmap(dir); + return shmem_swp_map(subdir) + offset; +} + +static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) +{ + long incdec = value? 1: -1; + + entry->val = value; + info->swapped += incdec; + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) + kmap_atomic_to_page(entry)->nr_swapped += incdec; +} + +/* + * shmem_swp_alloc - get the position of the swap entry for the page. + * If it does not exist allocate the entry. + * + * @info: info structure for the inode + * @index: index of the page to find + * @sgp: check and recheck i_size? skip allocation? + */ +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) +{ + struct inode *inode = &info->vfs_inode; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct page *page = NULL; + swp_entry_t *entry; + + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return ERR_PTR(-EINVAL); + + while (!(entry = shmem_swp_entry(info, index, &page))) { + if (sgp == SGP_READ) + return shmem_swp_map(ZERO_PAGE(0)); + /* + * Test free_blocks against 1 not 0, since we have 1 data + * page (and perhaps indirect index pages) yet to allocate: + * a waste to allocate index if we cannot allocate data. + */ + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks <= 1) { + spin_unlock(&sbinfo->stat_lock); + return ERR_PTR(-ENOSPC); + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + } + + spin_unlock(&info->lock); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); + if (page) { + page->nr_swapped = 0; + } + spin_lock(&info->lock); + + if (!page) { + shmem_free_blocks(inode, 1); + return ERR_PTR(-ENOMEM); + } + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + entry = ERR_PTR(-EINVAL); + break; + } + if (info->next_index <= index) + info->next_index = index + 1; + } + if (page) { + /* another task gave its page, or truncated the file */ + shmem_free_blocks(inode, 1); + shmem_dir_free(page); + } + if (info->next_index <= index && !IS_ERR(entry)) + info->next_index = index + 1; + return entry; +} + +/* + * shmem_free_swp - free some swap entries in a directory + * + * @dir: pointer to the directory + * @edir: pointer after last entry of the directory + */ +static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir) +{ + swp_entry_t *ptr; + int freed = 0; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val) { + free_swap_and_cache(*ptr); + *ptr = (swp_entry_t){0}; + freed++; + } + } + return freed; +} + +static int shmem_map_and_free_swp(struct page *subdir, + int offset, int limit, struct page ***dir) +{ + swp_entry_t *ptr; + int freed = 0; + + ptr = shmem_swp_map(subdir); + for (; offset < limit; offset += LATENCY_LIMIT) { + int size = limit - offset; + if (size > LATENCY_LIMIT) + size = LATENCY_LIMIT; + freed += shmem_free_swp(ptr+offset, ptr+offset+size); + if (need_resched()) { + shmem_swp_unmap(ptr); + if (*dir) { + shmem_dir_unmap(*dir); + *dir = NULL; + } + cond_resched(); + ptr = shmem_swp_map(subdir); + } + } + shmem_swp_unmap(ptr); + return freed; +} + +static void shmem_free_pages(struct list_head *next) +{ + struct page *page; + int freed = 0; + + do { + page = container_of(next, struct page, lru); + next = next->next; + shmem_dir_free(page); + freed++; + if (freed >= LATENCY_LIMIT) { + cond_resched(); + freed = 0; + } + } while (next); +} + +static void shmem_truncate(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + unsigned long size; + unsigned long limit; + unsigned long stage; + unsigned long diroff; + struct page **dir; + struct page *topdir; + struct page *middir; + struct page *subdir; + swp_entry_t *ptr; + LIST_HEAD(pages_to_free); + long nr_pages_to_free = 0; + long nr_swaps_freed = 0; + int offset; + int freed; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (idx >= info->next_index) + return; + + spin_lock(&info->lock); + info->flags |= SHMEM_TRUNCATE; + limit = info->next_index; + info->next_index = idx; + topdir = info->i_indirect; + if (topdir && idx <= SHMEM_NR_DIRECT) { + info->i_indirect = NULL; + nr_pages_to_free++; + list_add(&topdir->lru, &pages_to_free); + } + spin_unlock(&info->lock); + + if (info->swapped && idx < SHMEM_NR_DIRECT) { + ptr = info->i_direct; + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; + nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); + } + if (!topdir) + goto done2; + + BUG_ON(limit <= SHMEM_NR_DIRECT); + limit -= SHMEM_NR_DIRECT; + idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; + offset = idx % ENTRIES_PER_PAGE; + idx -= offset; + + dir = shmem_dir_map(topdir); + stage = ENTRIES_PER_PAGEPAGE/2; + if (idx < ENTRIES_PER_PAGEPAGE/2) { + middir = topdir; + diroff = idx/ENTRIES_PER_PAGE; + } else { + dir += ENTRIES_PER_PAGE/2; + dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; + while (stage <= idx) + stage += ENTRIES_PER_PAGEPAGE; + middir = *dir; + if (*dir) { + diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % + ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; + if (!diroff && !offset) { + *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + } + shmem_dir_unmap(dir); + dir = shmem_dir_map(middir); + } else { + diroff = 0; + offset = 0; + idx = stage; + } + } + + for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { + if (unlikely(idx == stage)) { + shmem_dir_unmap(dir); + dir = shmem_dir_map(topdir) + + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; + while (!*dir) { + dir++; + idx += ENTRIES_PER_PAGEPAGE; + if (idx >= limit) + goto done1; + } + stage = idx + ENTRIES_PER_PAGEPAGE; + middir = *dir; + *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + shmem_dir_unmap(dir); + cond_resched(); + dir = shmem_dir_map(middir); + diroff = 0; + } + subdir = dir[diroff]; + if (subdir && subdir->nr_swapped) { + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + freed = shmem_map_and_free_swp(subdir, + offset, size, &dir); + if (!dir) + dir = shmem_dir_map(middir); + nr_swaps_freed += freed; + if (offset) + spin_lock(&info->lock); + subdir->nr_swapped -= freed; + if (offset) + spin_unlock(&info->lock); + BUG_ON(subdir->nr_swapped > offset); + } + if (offset) + offset = 0; + else if (subdir) { + dir[diroff] = NULL; + nr_pages_to_free++; + list_add(&subdir->lru, &pages_to_free); + } + } +done1: + shmem_dir_unmap(dir); +done2: + if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { + /* + * Call truncate_inode_pages again: racing shmem_unuse_inode + * may have swizzled a page in from swap since vmtruncate or + * generic_delete_inode did it, before we lowered next_index. + * Also, though shmem_getpage checks i_size before adding to + * cache, no recheck after: so fix the narrow window there too. + */ + truncate_inode_pages(inode->i_mapping, inode->i_size); + } + + spin_lock(&info->lock); + info->flags &= ~SHMEM_TRUNCATE; + info->swapped -= nr_swaps_freed; + if (nr_pages_to_free) + shmem_free_blocks(inode, nr_pages_to_free); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + /* + * Empty swap vector directory pages to be freed? + */ + if (!list_empty(&pages_to_free)) { + pages_to_free.prev->next = NULL; + shmem_free_pages(pages_to_free.next); + } +} + +static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + int error; + + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size < inode->i_size) { + /* + * If truncating down to a partial page, then + * if that page is already allocated, hold it + * in memory until the truncation is over, so + * truncate_partial_page cannnot miss it were + * it assigned to swap. + */ + if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + (void) shmem_getpage(inode, + attr->ia_size>>PAGE_CACHE_SHIFT, + &page, SGP_READ, NULL); + } + /* + * Reset SHMEM_PAGEIN flag so that shmem_truncate can + * detect if any pages might have been added to cache + * after truncate_inode_pages. But we needn't bother + * if it's being fully truncated to zero-length: the + * nrpages check is efficient enough in that case. + */ + if (attr->ia_size) { + struct shmem_inode_info *info = SHMEM_I(inode); + spin_lock(&info->lock); + info->flags &= ~SHMEM_PAGEIN; + spin_unlock(&info->lock); + } + } + } + + error = inode_change_ok(inode, attr); + if (!error) + error = inode_setattr(inode, attr); + if (page) + page_cache_release(page); + return error; +} + +static void shmem_delete_inode(struct inode *inode) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (inode->i_op->truncate == shmem_truncate) { + shmem_unacct_size(info->flags, inode->i_size); + inode->i_size = 0; + shmem_truncate(inode); + if (!list_empty(&info->swaplist)) { + spin_lock(&shmem_swaplist_lock); + list_del_init(&info->swaplist); + spin_unlock(&shmem_swaplist_lock); + } + } + if (sbinfo) { + BUG_ON(inode->i_blocks); + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + clear_inode(inode); +} + +static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) +{ + swp_entry_t *ptr; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val == entry.val) + return ptr - dir; + } + return -1; +} + +static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +{ + struct inode *inode; + unsigned long idx; + unsigned long size; + unsigned long limit; + unsigned long stage; + struct page **dir; + struct page *subdir; + swp_entry_t *ptr; + int offset; + + idx = 0; + ptr = info->i_direct; + spin_lock(&info->lock); + limit = info->next_index; + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; + offset = shmem_find_swp(entry, ptr, ptr+size); + if (offset >= 0) { + shmem_swp_balance_unmap(); + goto found; + } + if (!info->i_indirect) + goto lost2; + + dir = shmem_dir_map(info->i_indirect); + stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; + + for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { + if (unlikely(idx == stage)) { + shmem_dir_unmap(dir-1); + dir = shmem_dir_map(info->i_indirect) + + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; + while (!*dir) { + dir++; + idx += ENTRIES_PER_PAGEPAGE; + if (idx >= limit) + goto lost1; + } + stage = idx + ENTRIES_PER_PAGEPAGE; + subdir = *dir; + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir); + } + subdir = *dir; + if (subdir && subdir->nr_swapped) { + ptr = shmem_swp_map(subdir); + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + offset = shmem_find_swp(entry, ptr, ptr+size); + if (offset >= 0) { + shmem_dir_unmap(dir); + goto found; + } + shmem_swp_unmap(ptr); + } + } +lost1: + shmem_dir_unmap(dir-1); +lost2: + spin_unlock(&info->lock); + return 0; +found: + idx += offset; + inode = &info->vfs_inode; + if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) { + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, ptr + offset, 0); + } + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); + /* + * Decrement swap count even when the entry is left behind: + * try_to_unuse will skip over mms, then reincrement count. + */ + swap_free(entry); + return 1; +} + +/* + * shmem_unuse() search for an eventually swapped out shmem page. + */ +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + struct list_head *p, *next; + struct shmem_inode_info *info; + int found = 0; + + spin_lock(&shmem_swaplist_lock); + list_for_each_safe(p, next, &shmem_swaplist) { + info = list_entry(p, struct shmem_inode_info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); + else if (shmem_unuse_inode(info, entry, page)) { + /* move head to start search for next from here */ + list_move_tail(&shmem_swaplist, &info->swaplist); + found = 1; + break; + } + } + spin_unlock(&shmem_swaplist_lock); + return found; +} + +/* + * Move the page from the page cache to the swap cache. + */ +static int shmem_writepage(struct page *page, struct writeback_control *wbc) +{ + struct shmem_inode_info *info; + swp_entry_t *entry, swap; + struct address_space *mapping; + unsigned long index; + struct inode *inode; + + BUG_ON(!PageLocked(page)); + BUG_ON(page_mapped(page)); + + mapping = page->mapping; + index = page->index; + inode = mapping->host; + info = SHMEM_I(inode); + if (info->flags & VM_LOCKED) + goto redirty; + swap = get_swap_page(); + if (!swap.val) + goto redirty; + + spin_lock(&info->lock); + shmem_recalc_inode(inode); + if (index >= info->next_index) { + BUG_ON(!(info->flags & SHMEM_TRUNCATE)); + goto unlock; + } + entry = shmem_swp_entry(info, index, NULL); + BUG_ON(!entry); + BUG_ON(entry->val); + + if (move_to_swap_cache(page, swap) == 0) { + shmem_swp_set(info, entry, swap.val); + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + if (list_empty(&info->swaplist)) { + spin_lock(&shmem_swaplist_lock); + /* move instead of add in case we're racing */ + list_move_tail(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } + unlock_page(page); + return 0; + } + + shmem_swp_unmap(entry); +unlock: + spin_unlock(&info->lock); + swap_free(swap); +redirty: + set_page_dirty(page); + return WRITEPAGE_ACTIVATE; /* Return with the page locked */ +} + +#ifdef CONFIG_NUMA +static struct page *shmem_swapin_async(struct shared_policy *p, + swp_entry_t entry, unsigned long idx) +{ + struct page *page; + struct vm_area_struct pvma; + + /* Create a pseudo vma that just contains the policy */ + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_end = PAGE_SIZE; + pvma.vm_pgoff = idx; + pvma.vm_policy = mpol_shared_policy_lookup(p, idx); + page = read_swap_cache_async(entry, &pvma, 0); + mpol_free(pvma.vm_policy); + return page; +} + +struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, + unsigned long idx) +{ + struct shared_policy *p = &info->policy; + int i, num; + struct page *page; + unsigned long offset; + + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + page = shmem_swapin_async(p, + swp_entry(swp_type(entry), offset), idx); + if (!page) + break; + page_cache_release(page); + } + lru_add_drain(); /* Push any new pages onto the LRU now */ + return shmem_swapin_async(p, entry, idx); +} + +static struct page * +shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info, + unsigned long idx) +{ + struct vm_area_struct pvma; + struct page *page; + + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_pgoff = idx; + pvma.vm_end = PAGE_SIZE; + page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0); + mpol_free(pvma.vm_policy); + return page; +} +#else +static inline struct page * +shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) +{ + swapin_readahead(entry, 0, NULL); + return read_swap_cache_async(entry, NULL, 0); +} + +static inline struct page * +shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info, + unsigned long idx) +{ + return alloc_page(gfp | __GFP_ZERO); +} +#endif + +/* + * shmem_getpage - either get the page from swap or allocate a new one + * + * If we allocate a new one we do not mark it dirty. That's up to the + * vm. If we swap it in we mark it dirty since we also free the swap + * entry since a page cannot live in both the swap and page cache + */ +static int shmem_getpage(struct inode *inode, unsigned long idx, + struct page **pagep, enum sgp_type sgp, int *type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo; + struct page *filepage = *pagep; + struct page *swappage; + swp_entry_t *entry; + swp_entry_t swap; + int error; + + if (idx >= SHMEM_MAX_INDEX) + return -EFBIG; + /* + * Normally, filepage is NULL on entry, and either found + * uptodate immediately, or allocated and zeroed, or read + * in under swappage, which is then assigned to filepage. + * But shmem_prepare_write passes in a locked filepage, + * which may be found not uptodate by other callers too, + * and may need to be copied from the swappage read in. + */ +repeat: + if (!filepage) + filepage = find_lock_page(mapping, idx); + if (filepage && PageUptodate(filepage)) + goto done; + error = 0; + if (sgp == SGP_QUICK) + goto failed; + + spin_lock(&info->lock); + shmem_recalc_inode(inode); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); + error = PTR_ERR(entry); + goto failed; + } + swap = *entry; + + if (swap.val) { + /* Look it up and read it in.. */ + swappage = lookup_swap_cache(swap); + if (!swappage) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + /* here we actually do the io */ + if (type && *type == VM_FAULT_MINOR) { + inc_page_state(pgmajfault); + *type = VM_FAULT_MAJOR; + } + swappage = shmem_swapin(info, swap, idx); + if (!swappage) { + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + if (entry->val == swap.val) + error = -ENOMEM; + shmem_swp_unmap(entry); + } + spin_unlock(&info->lock); + if (error) + goto failed; + goto repeat; + } + wait_on_page_locked(swappage); + page_cache_release(swappage); + goto repeat; + } + + /* We have to do this with page locked to prevent races */ + if (TestSetPageLocked(swappage)) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + wait_on_page_locked(swappage); + page_cache_release(swappage); + goto repeat; + } + if (PageWriteback(swappage)) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + wait_on_page_writeback(swappage); + unlock_page(swappage); + page_cache_release(swappage); + goto repeat; + } + if (!PageUptodate(swappage)) { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + error = -EIO; + goto failed; + } + + if (filepage) { + shmem_swp_set(info, entry, 0); + shmem_swp_unmap(entry); + delete_from_swap_cache(swappage); + spin_unlock(&info->lock); + copy_highpage(filepage, swappage); + unlock_page(swappage); + page_cache_release(swappage); + flush_dcache_page(filepage); + SetPageUptodate(filepage); + set_page_dirty(filepage); + swap_free(swap); + } else if (!(error = move_from_swap_cache( + swappage, idx, mapping))) { + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, entry, 0); + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + filepage = swappage; + swap_free(swap); + } else { + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + if (error == -ENOMEM) { + /* let kswapd refresh zone for GFP_ATOMICs */ + blk_congestion_wait(WRITE, HZ/50); + } + goto repeat; + } + } else if (sgp == SGP_READ && !filepage) { + shmem_swp_unmap(entry); + filepage = find_get_page(mapping, idx); + if (filepage && + (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { + spin_unlock(&info->lock); + wait_on_page_locked(filepage); + page_cache_release(filepage); + filepage = NULL; + goto repeat; + } + spin_unlock(&info->lock); + } else { + shmem_swp_unmap(entry); + sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks == 0 || + shmem_acct_block(info->flags)) { + spin_unlock(&sbinfo->stat_lock); + spin_unlock(&info->lock); + error = -ENOSPC; + goto failed; + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + } else if (shmem_acct_block(info->flags)) { + spin_unlock(&info->lock); + error = -ENOSPC; + goto failed; + } + + if (!filepage) { + spin_unlock(&info->lock); + filepage = shmem_alloc_page(mapping_gfp_mask(mapping), + info, + idx); + if (!filepage) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + error = -ENOMEM; + goto failed; + } + + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + swap = *entry; + shmem_swp_unmap(entry); + } + if (error || swap.val || 0 != add_to_page_cache_lru( + filepage, mapping, idx, GFP_ATOMIC)) { + spin_unlock(&info->lock); + page_cache_release(filepage); + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + filepage = NULL; + if (error) + goto failed; + goto repeat; + } + info->flags |= SHMEM_PAGEIN; + } + + info->alloced++; + spin_unlock(&info->lock); + flush_dcache_page(filepage); + SetPageUptodate(filepage); + } +done: + if (*pagep != filepage) { + unlock_page(filepage); + *pagep = filepage; + } + return 0; + +failed: + if (*pagep != filepage) { + unlock_page(filepage); + page_cache_release(filepage); + } + return error; +} + +struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page = NULL; + unsigned long idx; + int error; + + idx = (address - vma->vm_start) >> PAGE_SHIFT; + idx += vma->vm_pgoff; + idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return NOPAGE_SIGBUS; + + error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); + if (error) + return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; + + mark_page_accessed(page); + return page; +} + +static int shmem_populate(struct vm_area_struct *vma, + unsigned long addr, unsigned long len, + pgprot_t prot, unsigned long pgoff, int nonblock) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct mm_struct *mm = vma->vm_mm; + enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; + unsigned long size; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) + return -EINVAL; + + while ((long) len > 0) { + struct page *page = NULL; + int err; + /* + * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE + */ + err = shmem_getpage(inode, pgoff, &page, sgp, NULL); + if (err) + return err; + if (page) { + mark_page_accessed(page); + err = install_page(mm, vma, addr, page, prot); + if (err) { + page_cache_release(page); + return err; + } + } else if (nonblock) { + err = install_file_pte(mm, vma, addr, pgoff, prot); + if (err) + return err; + } + + len -= PAGE_SIZE; + addr += PAGE_SIZE; + pgoff++; + } + return 0; +} + +#ifdef CONFIG_NUMA +int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +{ + struct inode *i = vma->vm_file->f_dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); +} + +struct mempolicy * +shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) +{ + struct inode *i = vma->vm_file->f_dentry->d_inode; + unsigned long idx; + + idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); +} +#endif + +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + struct inode *inode = file->f_dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + int retval = -ENOMEM; + + spin_lock(&info->lock); + if (lock && !(info->flags & VM_LOCKED)) { + if (!user_shm_lock(inode->i_size, user)) + goto out_nomem; + info->flags |= VM_LOCKED; + } + if (!lock && (info->flags & VM_LOCKED) && user) { + user_shm_unlock(inode->i_size, user); + info->flags &= ~VM_LOCKED; + } + retval = 0; +out_nomem: + spin_unlock(&info->lock); + return retval; +} + +static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &shmem_vm_ops; + return 0; +} + +static struct inode * +shmem_get_inode(struct super_block *sb, int mode, dev_t dev) +{ + struct inode *inode; + struct shmem_inode_info *info; + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return NULL; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + + inode = new_inode(sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mapping->a_ops = &shmem_aops; + inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + INIT_LIST_HEAD(&info->swaplist); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy); + break; + case S_IFDIR: + inode->i_nlink++; + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy); + break; + } + } else if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + return inode; +} + +#ifdef CONFIG_TMPFS + +static int shmem_set_size(struct shmem_sb_info *sbinfo, + unsigned long max_blocks, unsigned long max_inodes) +{ + int error; + unsigned long blocks, inodes; + + spin_lock(&sbinfo->stat_lock); + blocks = sbinfo->max_blocks - sbinfo->free_blocks; + inodes = sbinfo->max_inodes - sbinfo->free_inodes; + error = -EINVAL; + if (max_blocks < blocks) + goto out; + if (max_inodes < inodes) + goto out; + error = 0; + sbinfo->max_blocks = max_blocks; + sbinfo->free_blocks = max_blocks - blocks; + sbinfo->max_inodes = max_inodes; + sbinfo->free_inodes = max_inodes - inodes; +out: + spin_unlock(&sbinfo->stat_lock); + return error; +} + +static struct inode_operations shmem_symlink_inode_operations; +static struct inode_operations shmem_symlink_inline_operations; + +/* + * Normally tmpfs makes no use of shmem_prepare_write, but it + * lets a tmpfs file be used read-write below the loop driver. + */ +static int +shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) +{ + struct inode *inode = page->mapping->host; + return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); +} + +static ssize_t +shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + loff_t pos; + unsigned long written; + ssize_t err; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + down(&inode->i_sem); + + pos = *ppos; + written = 0; + + err = generic_write_checks(file, &pos, &count, 0); + if (err || !count) + goto out; + + err = remove_suid(file->f_dentry); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + do { + struct page *page = NULL; + unsigned long bytes, index, offset; + char *kaddr; + int left; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * We don't hold page lock across copy from user - + * what would it guard against? - so no deadlock here. + * But it still may be a good idea to prefault below. + */ + + err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL); + if (err) + break; + + left = bytes; + if (PageHighMem(page)) { + volatile unsigned char dummy; + __get_user(dummy, buf); + __get_user(dummy, buf + bytes - 1); + + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_from_user_inatomic(kaddr + offset, + buf, bytes); + kunmap_atomic(kaddr, KM_USER0); + } + if (left) { + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + } + + written += bytes; + count -= bytes; + pos += bytes; + buf += bytes; + if (pos > inode->i_size) + i_size_write(inode, pos); + + flush_dcache_page(page); + set_page_dirty(page); + mark_page_accessed(page); + page_cache_release(page); + + if (left) { + pos -= left; + written -= left; + err = -EFAULT; + break; + } + + /* + * Our dirty pages are not counted in nr_dirty, + * and we do not attempt to balance dirty pages. + */ + + cond_resched(); + } while (count); + + *ppos = pos; + if (written) + err = written; +out: + up(&inode->i_sem); + return err; +} + +static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long index, offset; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page = NULL; + unsigned long end_index, nr, ret; + loff_t i_size = i_size_read(inode); + + end_index = i_size >> PAGE_CACHE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); + if (desc->error) { + if (desc->error == -EINVAL) + desc->error = 0; + break; + } + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_sem protection against truncate + */ + nr = PAGE_CACHE_SIZE; + i_size = i_size_read(inode); + end_index = i_size >> PAGE_CACHE_SHIFT; + if (index == end_index) { + nr = i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) { + if (page) + page_cache_release(page); + break; + } + } + nr -= offset; + + if (page) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the beginning. + */ + if (!offset) + mark_page_accessed(page); + } else + page = ZERO_PAGE(0); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret != nr || !desc->count) + break; + + cond_resched(); + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + file_accessed(filp); +} + +static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + read_descriptor_t desc; + + if ((ssize_t) count < 0) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + if (!count) + return 0; + + desc.written = 0; + desc.count = count; + desc.arg.buf = buf; + desc.error = 0; + + do_shmem_file_read(filp, ppos, &desc, file_read_actor); + if (desc.written) + return desc.written; + return desc.error; +} + +static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, + size_t count, read_actor_t actor, void *target) +{ + read_descriptor_t desc; + + if (!count) + return 0; + + desc.written = 0; + desc.count = count; + desc.arg.data = target; + desc.error = 0; + + do_shmem_file_read(in_file, ppos, &desc, actor); + if (desc.written) + return desc.written; + return desc.error; +} + +static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + buf->f_type = TMPFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + spin_unlock(&sbinfo->stat_lock); + } + /* else leave those fields 0 like simple_statfs */ + return 0; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +static int +shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); + int error = -ENOSPC; + + if (inode) { + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + error = 0; + } + return error; +} + +static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int error; + + if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) + return error; + dir->i_nlink++; + return 0; +} + +static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return shmem_mknod(dir, dentry, mode | S_IFREG, 0); +} + +/* + * Link a file.. + */ +static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + /* + * No ordinary (disk based) filesystem counts links as inodes; + * but each new link needs a new dentry, pinning lowmem, and + * tmpfs dentries cannot be pruned until they are unlinked. + */ + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + + dir->i_size += BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink++; + atomic_inc(&inode->i_count); /* New dentry reference */ + dget(dentry); /* Extra pinning count for the created dentry */ + d_instantiate(dentry, inode); + return 0; +} + +static int shmem_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + } + + dir->i_size -= BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink--; + dput(dentry); /* Undo the count from "create" - this does all the work */ + return 0; +} + +static int shmem_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + dir->i_nlink--; + return shmem_unlink(dir, dentry); +} + +/* + * The VFS layer already does all the dentry stuff for rename, + * we just have to decrement the usage count for the target if + * it exists so that the VFS layer correctly free's it when it + * gets overwritten. + */ +static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int they_are_dirs = S_ISDIR(inode->i_mode); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (new_dentry->d_inode) { + (void) shmem_unlink(new_dir, new_dentry); + if (they_are_dirs) + old_dir->i_nlink--; + } else if (they_are_dirs) { + old_dir->i_nlink--; + new_dir->i_nlink++; + } + + old_dir->i_size -= BOGO_DIRENT_SIZE; + new_dir->i_size += BOGO_DIRENT_SIZE; + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + inode->i_ctime = CURRENT_TIME; + return 0; +} + +static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int error; + int len; + struct inode *inode; + struct page *page = NULL; + char *kaddr; + struct shmem_inode_info *info; + + len = strlen(symname) + 1; + if (len > PAGE_CACHE_SIZE) + return -ENAMETOOLONG; + + inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + if (!inode) + return -ENOSPC; + + info = SHMEM_I(inode); + inode->i_size = len-1; + if (len <= (char *)inode - (char *)info) { + /* do it inline */ + memcpy(info, symname, len); + inode->i_op = &shmem_symlink_inline_operations; + } else { + error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); + if (error) { + iput(inode); + return error; + } + inode->i_op = &shmem_symlink_inode_operations; + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, len); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + page_cache_release(page); + } + if (dir->i_mode & S_ISGID) + inode->i_gid = dir->i_gid; + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + d_instantiate(dentry, inode); + dget(dentry); + return 0; +} + +static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); + return 0; +} + +static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); + nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + return 0; +} + +static void shmem_put_link(struct dentry *dentry, struct nameidata *nd) +{ + if (!IS_ERR(nd_get_link(nd))) { + struct page *page; + + page = find_get_page(dentry->d_inode->i_mapping, 0); + if (!page) + BUG(); + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); + page_cache_release(page); + } +} + +static struct inode_operations shmem_symlink_inline_operations = { + .readlink = generic_readlink, + .follow_link = shmem_follow_link_inline, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +static struct inode_operations shmem_symlink_inode_operations = { + .truncate = shmem_truncate, + .readlink = generic_readlink, + .follow_link = shmem_follow_link, + .put_link = shmem_put_link, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) +{ + char *this_char, *value, *rest; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char,'=')) != NULL) { + *value++ = 0; + } else { + printk(KERN_ERR + "tmpfs: No value for mount option '%s'\n", + this_char); + return 1; + } + + if (!strcmp(this_char,"size")) { + unsigned long long size; + size = memparse(value,&rest); + if (*rest == '%') { + size <<= PAGE_SHIFT; + size *= totalram_pages; + do_div(size, 100); + rest++; + } + if (*rest) + goto bad_val; + *blocks = size >> PAGE_CACHE_SHIFT; + } else if (!strcmp(this_char,"nr_blocks")) { + *blocks = memparse(value,&rest); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"nr_inodes")) { + *inodes = memparse(value,&rest); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"mode")) { + if (!mode) + continue; + *mode = simple_strtoul(value,&rest,8); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"uid")) { + if (!uid) + continue; + *uid = simple_strtoul(value,&rest,0); + if (*rest) + goto bad_val; + } else if (!strcmp(this_char,"gid")) { + if (!gid) + continue; + *gid = simple_strtoul(value,&rest,0); + if (*rest) + goto bad_val; + } else { + printk(KERN_ERR "tmpfs: Bad mount option %s\n", + this_char); + return 1; + } + } + return 0; + +bad_val: + printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + value, this_char); + return 1; + +} + +static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + unsigned long max_blocks = 0; + unsigned long max_inodes = 0; + + if (sbinfo) { + max_blocks = sbinfo->max_blocks; + max_inodes = sbinfo->max_inodes; + } + if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) + return -EINVAL; + /* Keep it simple: disallow limited <-> unlimited remount */ + if ((max_blocks || max_inodes) == !sbinfo) + return -EINVAL; + /* But allow the pointless unlimited -> unlimited remount */ + if (!sbinfo) + return 0; + return shmem_set_size(sbinfo, max_blocks, max_inodes); +} +#endif + +static void shmem_put_super(struct super_block *sb) +{ + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; +} + +#ifdef CONFIG_TMPFS_XATTR +static struct xattr_handler *shmem_xattr_handlers[]; +#else +#define shmem_xattr_handlers NULL +#endif + +static int shmem_fill_super(struct super_block *sb, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + int mode = S_IRWXUGO | S_ISVTX; + uid_t uid = current->fsuid; + gid_t gid = current->fsgid; + int err = -ENOMEM; + +#ifdef CONFIG_TMPFS + unsigned long blocks = 0; + unsigned long inodes = 0; + + /* + * Per default we only allow half of the physical ram per + * tmpfs instance, limiting inodes to one per page of lowmem; + * but the internal instance is left unlimited. + */ + if (!(sb->s_flags & MS_NOUSER)) { + blocks = totalram_pages / 2; + inodes = totalram_pages - totalhigh_pages; + if (inodes > blocks) + inodes = blocks; + + if (shmem_parse_options(data, &mode, + &uid, &gid, &blocks, &inodes)) + return -EINVAL; + } + + if (blocks || inodes) { + struct shmem_sb_info *sbinfo; + sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + sb->s_fs_info = sbinfo; + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_blocks = blocks; + sbinfo->free_blocks = blocks; + sbinfo->max_inodes = inodes; + sbinfo->free_inodes = inodes; + } + sb->s_xattr = shmem_xattr_handlers; +#else + sb->s_flags |= MS_NOUSER; +#endif + + sb->s_maxbytes = SHMEM_MAX_BYTES; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = TMPFS_MAGIC; + sb->s_op = &shmem_ops; + inode = shmem_get_inode(sb, S_IFDIR | mode, 0); + if (!inode) + goto failed; + inode->i_uid = uid; + inode->i_gid = gid; + root = d_alloc_root(inode); + if (!root) + goto failed_iput; + sb->s_root = root; + return 0; + +failed_iput: + iput(inode); +failed: + shmem_put_super(sb); + return err; +} + +static kmem_cache_t *shmem_inode_cachep; + +static struct inode *shmem_alloc_inode(struct super_block *sb) +{ + struct shmem_inode_info *p; + p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); + if (!p) + return NULL; + return &p->vfs_inode; +} + +static void shmem_destroy_inode(struct inode *inode) +{ + if ((inode->i_mode & S_IFMT) == S_IFREG) { + /* only struct inode is valid if it's an inline symlink */ + mpol_free_shared_policy(&SHMEM_I(inode)->policy); + } + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + +static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct shmem_inode_info *p = (struct shmem_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + inode_init_once(&p->vfs_inode); + } +} + +static int init_inodecache(void) +{ + shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", + sizeof(struct shmem_inode_info), + 0, 0, init_once, NULL); + if (shmem_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + if (kmem_cache_destroy(shmem_inode_cachep)) + printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); +} + +static struct address_space_operations shmem_aops = { + .writepage = shmem_writepage, + .set_page_dirty = __set_page_dirty_nobuffers, +#ifdef CONFIG_TMPFS + .prepare_write = shmem_prepare_write, + .commit_write = simple_commit_write, +#endif +}; + +static struct file_operations shmem_file_operations = { + .mmap = shmem_mmap, +#ifdef CONFIG_TMPFS + .llseek = generic_file_llseek, + .read = shmem_file_read, + .write = shmem_file_write, + .fsync = simple_sync_file, + .sendfile = shmem_file_sendfile, +#endif +}; + +static struct inode_operations shmem_inode_operations = { + .truncate = shmem_truncate, + .setattr = shmem_notify_change, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +static struct inode_operations shmem_dir_inode_operations = { +#ifdef CONFIG_TMPFS + .create = shmem_create, + .lookup = simple_lookup, + .link = shmem_link, + .unlink = shmem_unlink, + .symlink = shmem_symlink, + .mkdir = shmem_mkdir, + .rmdir = shmem_rmdir, + .mknod = shmem_mknod, + .rename = shmem_rename, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +#endif +#endif +}; + +static struct inode_operations shmem_special_inode_operations = { +#ifdef CONFIG_TMPFS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +static struct super_operations shmem_ops = { + .alloc_inode = shmem_alloc_inode, + .destroy_inode = shmem_destroy_inode, +#ifdef CONFIG_TMPFS + .statfs = shmem_statfs, + .remount_fs = shmem_remount_fs, +#endif + .delete_inode = shmem_delete_inode, + .drop_inode = generic_delete_inode, + .put_super = shmem_put_super, +}; + +static struct vm_operations_struct shmem_vm_ops = { + .nopage = shmem_nopage, + .populate = shmem_populate, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + + +#ifdef CONFIG_TMPFS_SECURITY + +static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + return security_inode_listsecurity(inode, list, list_len); +} + +static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_getsecurity(inode, name, buffer, size); +} + +static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_setsecurity(inode, name, value, size, flags); +} + +static struct xattr_handler shmem_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = shmem_xattr_security_list, + .get = shmem_xattr_security_get, + .set = shmem_xattr_security_set, +}; + +#endif /* CONFIG_TMPFS_SECURITY */ + +#ifdef CONFIG_TMPFS_XATTR + +static struct xattr_handler *shmem_xattr_handlers[] = { +#ifdef CONFIG_TMPFS_SECURITY + &shmem_xattr_security_handler, +#endif + NULL +}; + +#endif /* CONFIG_TMPFS_XATTR */ + +static struct super_block *shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_nodev(fs_type, flags, data, shmem_fill_super); +} + +static struct file_system_type tmpfs_fs_type = { + .owner = THIS_MODULE, + .name = "tmpfs", + .get_sb = shmem_get_sb, + .kill_sb = kill_litter_super, +}; +static struct vfsmount *shm_mnt; + +static int __init init_tmpfs(void) +{ + int error; + + error = init_inodecache(); + if (error) + goto out3; + + error = register_filesystem(&tmpfs_fs_type); + if (error) { + printk(KERN_ERR "Could not register tmpfs\n"); + goto out2; + } +#ifdef CONFIG_TMPFS + devfs_mk_dir("shm"); +#endif + shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER, + tmpfs_fs_type.name, NULL); + if (IS_ERR(shm_mnt)) { + error = PTR_ERR(shm_mnt); + printk(KERN_ERR "Could not kern_mount tmpfs\n"); + goto out1; + } + return 0; + +out1: + unregister_filesystem(&tmpfs_fs_type); +out2: + destroy_inodecache(); +out3: + shm_mnt = ERR_PTR(error); + return error; +} +module_init(init_tmpfs) + +/* + * shmem_file_setup - get an unlinked file living in tmpfs + * + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * + */ +struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +{ + int error; + struct file *file; + struct inode *inode; + struct dentry *dentry, *root; + struct qstr this; + + if (IS_ERR(shm_mnt)) + return (void *)shm_mnt; + + if (size < 0 || size > SHMEM_MAX_BYTES) + return ERR_PTR(-EINVAL); + + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + this.name = name; + this.len = strlen(name); + this.hash = 0; /* will go */ + root = shm_mnt->mnt_root; + dentry = d_alloc(root, &this); + if (!dentry) + goto put_memory; + + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + + error = -ENOSPC; + inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto close_file; + + SHMEM_I(inode)->flags = flags & VM_ACCOUNT; + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ + file->f_vfsmnt = mntget(shm_mnt); + file->f_dentry = dentry; + file->f_mapping = inode->i_mapping; + file->f_op = &shmem_file_operations; + file->f_mode = FMODE_WRITE | FMODE_READ; + return file; + +close_file: + put_filp(file); +put_dentry: + dput(dentry); +put_memory: + shmem_unacct_size(flags, size); + return ERR_PTR(error); +} + +/* + * shmem_zero_setup - setup a shared anonymous mapping + * + * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + + file = shmem_file_setup("dev/zero", size, vma->vm_flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + return 0; +} diff --git a/mm/slab.c b/mm/slab.c new file mode 100644 index 000000000000..ec660d85ddd7 --- /dev/null +++ b/mm/slab.c @@ -0,0 +1,3060 @@ +/* + * linux/mm/slab.c + * Written by Mark Hemment, 1996/97. + * (markhe@nextd.demon.co.uk) + * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * + * Major cleanup, different bufctl logic, per-cpu arrays + * (c) 2000 Manfred Spraul + * + * Cleanup, make the head arrays unconditional, preparation for NUMA + * (c) 2002 Manfred Spraul + * + * An implementation of the Slab Allocator as described in outline in; + * UNIX Internals: The New Frontiers by Uresh Vahalia + * Pub: Prentice Hall ISBN 0-13-101908-2 + * or with a little more detail in; + * The Slab Allocator: An Object-Caching Kernel Memory Allocator + * Jeff Bonwick (Sun Microsystems). + * Presented at: USENIX Summer 1994 Technical Conference + * + * The memory is organized in caches, one cache for each object type. + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) + * Each cache consists out of many slabs (they are small (usually one + * page long) and always contiguous), and each slab contains multiple + * initialized objects. + * + * This means, that your constructor is used only for newly allocated + * slabs and you must pass objects with the same intializations to + * kmem_cache_free. + * + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, + * normal). If you need a special memory type, then must create a new + * cache for that memory type. + * + * In order to reduce fragmentation, the slabs are sorted in 3 groups: + * full slabs with 0 free objects + * partial slabs + * empty slabs with no allocated objects + * + * If partial slabs exist, then new allocations come from these slabs, + * otherwise from empty slabs or new slabs are allocated. + * + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache + * during kmem_cache_destroy(). The caller must prevent concurrent allocs. + * + * Each cache has a short per-cpu head array, most allocs + * and frees go into that array, and if that array overflows, then 1/2 + * of the entries in the array are given back into the global cache. + * The head array is strictly LIFO and should improve the cache hit rates. + * On SMP, it additionally reduces the spinlock operations. + * + * The c_cpuarray may not be read with enabled local interrupts - + * it's changed with a smp_call_function(). + * + * SMP synchronization: + * constructors and destructors are called without any locking. + * Several members in kmem_cache_t and struct slab never change, they + * are accessed without any locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * and local interrupts are disabled so slab code is preempt-safe. + * The non-constant members are protected with a per-cache irq spinlock. + * + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch + * in 2000 - many ideas in the current implementation are derived from + * his patch. + * + * Further notes from the original documentation: + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * + * At present, each engine can be growing a cache. This should be blocked. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, + * SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define DEBUG 1 +#define STATS 1 +#define FORCED_DEBUG 1 +#else +#define DEBUG 0 +#define STATS 0 +#define FORCED_DEBUG 0 +#endif + + +/* Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) + +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif + +#ifndef ARCH_KMALLOC_MINALIGN +/* + * Enforce a minimum alignment for the kmalloc caches. + * Usually, the kmalloc caches are cache_line_size() aligned, except when + * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. + * Some archs want to perform DMA into kmalloc caches and need a guaranteed + * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. + * Note that this flag disables some debug features. + */ +#define ARCH_KMALLOC_MINALIGN 0 +#endif + +#ifndef ARCH_SLAB_MINALIGN +/* + * Enforce a minimum alignment for all caches. + * Intended for archs that get misalignment faults even for BYTES_PER_WORD + * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. + * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables + * some debug features. + */ +#define ARCH_SLAB_MINALIGN 0 +#endif + +#ifndef ARCH_KMALLOC_FLAGS +#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN +#endif + +/* Legal flag mask for kmem_cache_create(). */ +#if DEBUG +# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_HWCACHE_ALIGN | \ + SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU) +#else +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ + SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU) +#endif + +/* + * kmem_bufctl_t: + * + * Bufctl's are used for linking objs within a slab + * linked offsets. + * + * This implementation relies on "struct page" for locating the cache & + * slab an object belongs to. + * This allows the bufctl structure to be small (one int), but limits + * the number of objects a slab (not a cache) can contain when off-slab + * bufctls are used. The limit is the size of the largest general cache + * that does not use off-slab slabs. + * For 32bit archs with 4 kB pages, is this 56. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. + */ + +#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) + +/* Max number of objs-per-slab for caches which use off-slab slabs. + * Needed to avoid a possible looping condition in cache_grow(). + */ +static unsigned long offslab_limit; + +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; +}; + +/* + * struct slab_rcu + * + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to + * arrange for kmem_freepages to be called via RCU. This is useful if + * we need to approach a kernel structure obliquely, from its address + * obtained without the usual locking. We can lock the structure to + * stabilize it and check it's still at the given address, only if we + * can be sure that the memory has not been meanwhile reused for some + * other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. + * + * We assume struct slab_rcu can overlay struct slab when destroying. + */ +struct slab_rcu { + struct rcu_head head; + kmem_cache_t *cachep; + void *addr; +}; + +/* + * struct array_cache + * + * Per cpu structures + * Purpose: + * - LIFO ordering, to hand out cache-warm objects from _alloc + * - reduce the number of linked list operations + * - reduce spinlock operations + * + * The limit is stored in the per-cpu structure to reduce the data cache + * footprint. + * + */ +struct array_cache { + unsigned int avail; + unsigned int limit; + unsigned int batchcount; + unsigned int touched; +}; + +/* bootstrap: The caches do not work without cpuarrays anymore, + * but the cpuarrays are allocated from the generic caches... + */ +#define BOOT_CPUCACHE_ENTRIES 1 +struct arraycache_init { + struct array_cache cache; + void * entries[BOOT_CPUCACHE_ENTRIES]; +}; + +/* + * The slab lists of all objects. + * Hopefully reduce the internal fragmentation + * NUMA: The spinlock could be moved from the kmem_cache_t + * into this structure, too. Figure out what causes + * fewer cross-node spinlock operations. + */ +struct kmem_list3 { + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + int free_touched; + unsigned long next_reap; + struct array_cache *shared; +}; + +#define LIST3_INIT(parent) \ + { \ + .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ + .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ + .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ + } +#define list3_data(cachep) \ + (&(cachep)->lists) + +/* NUMA: per-node */ +#define list3_data_ptr(cachep, ptr) \ + list3_data(cachep) + +/* + * kmem_cache_t + * + * manages a cache. + */ + +struct kmem_cache_s { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; + unsigned int batchcount; + unsigned int limit; +/* 2) touched by every alloc & free from the backend */ + struct kmem_list3 lists; + /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */ + unsigned int objsize; + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + unsigned int free_limit; /* upper limit of objects in the lists */ + spinlock_t spinlock; + +/* 3) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + unsigned int gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + unsigned int colour_next; /* cache colouring */ + kmem_cache_t *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor)(void *, kmem_cache_t *, unsigned long); + + /* de-constructor func */ + void (*dtor)(void *, kmem_cache_t *, unsigned long); + +/* 4) cache creation/removal */ + const char *name; + struct list_head next; + +/* 5) statistics */ +#if STATS + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; +#endif +#if DEBUG + int dbghead; + int reallen; +#endif +}; + +#define CFLGS_OFF_SLAB (0x80000000UL) +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) + +#define BATCHREFILL_LIMIT 16 +/* Optimization question: fewer reaps means less + * probability for unnessary cpucache drain/refill cycles. + * + * OTHO the cpuarrays can contain lots of objects, + * which could lock up otherwise freeable slabs. + */ +#define REAPTIMEOUT_CPUC (2*HZ) +#define REAPTIMEOUT_LIST3 (4*HZ) + +#if STATS +#define STATS_INC_ACTIVE(x) ((x)->num_active++) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_INC_REAPED(x) ((x)->reaped++) +#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_INC_ERR(x) ((x)->errors++) +#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) +#define STATS_SET_FREEABLE(x, i) \ + do { if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) + +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#else +#define STATS_INC_ACTIVE(x) do { } while (0) +#define STATS_DEC_ACTIVE(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_INC_REAPED(x) do { } while (0) +#define STATS_SET_HIGH(x) do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) +#define STATS_INC_NODEALLOCS(x) do { } while (0) +#define STATS_SET_FREEABLE(x, i) \ + do { } while (0) + +#define STATS_INC_ALLOCHIT(x) do { } while (0) +#define STATS_INC_ALLOCMISS(x) do { } while (0) +#define STATS_INC_FREEHIT(x) do { } while (0) +#define STATS_INC_FREEMISS(x) do { } while (0) +#endif + +#if DEBUG +/* Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. + */ +#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ +#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ + +/* ...and for poisoning */ +#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ +#define POISON_FREE 0x6b /* for use-after-free poisoning */ +#define POISON_END 0xa5 /* end-byte of poisoning */ + +/* memory layout of objects: + * 0 : objp + * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that + * the end of an object is aligned with the end of the real + * allocation. Catches writes behind the end of the allocation. + * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: + * redzone word. + * cachep->dbghead: The real object. + * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + */ +static int obj_dbghead(kmem_cache_t *cachep) +{ + return cachep->dbghead; +} + +static int obj_reallen(kmem_cache_t *cachep) +{ + return cachep->reallen; +} + +static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); +} + +static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + if (cachep->flags & SLAB_STORE_USER) + return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); + return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); +} + +static void **dbg_userword(kmem_cache_t *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_STORE_USER)); + return (void**)(objp+cachep->objsize-BYTES_PER_WORD); +} + +#else + +#define obj_dbghead(x) 0 +#define obj_reallen(cachep) (cachep->objsize) +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) + +#endif + +/* + * Maximum size of an obj (in 2^order pages) + * and absolute limit for the gfp order. + */ +#if defined(CONFIG_LARGE_ALLOCS) +#define MAX_OBJ_ORDER 13 /* up to 32Mb */ +#define MAX_GFP_ORDER 13 /* up to 32Mb */ +#elif defined(CONFIG_MMU) +#define MAX_OBJ_ORDER 5 /* 32 pages */ +#define MAX_GFP_ORDER 5 /* 32 pages */ +#else +#define MAX_OBJ_ORDER 8 /* up to 1Mb */ +#define MAX_GFP_ORDER 8 /* up to 1Mb */ +#endif + +/* + * Do not go above this order unless 0 objects fit into the slab. + */ +#define BREAK_GFP_ORDER_HI 1 +#define BREAK_GFP_ORDER_LO 0 +static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; + +/* Macros for storing/retrieving the cachep and or slab from the + * global 'mem_map'. These are used to find the slab an obj belongs to. + * With kfree(), these are used to find the cache which an obj belongs to. + */ +#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) +#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) +#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) +#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) + +/* These are the default caches for kmalloc. Custom caches can have other sizes. */ +struct cache_sizes malloc_sizes[] = { +#define CACHE(x) { .cs_size = (x) }, +#include + CACHE(ULONG_MAX) +#undef CACHE +}; +EXPORT_SYMBOL(malloc_sizes); + +/* Must match cache_sizes above. Out of line to keep cache footprint low. */ +struct cache_names { + char *name; + char *name_dma; +}; + +static struct cache_names __initdata cache_names[] = { +#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, +#include + { NULL, } +#undef CACHE +}; + +static struct arraycache_init initarray_cache __initdata = + { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; +static struct arraycache_init initarray_generic = + { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + +/* internal cache of cache description objs */ +static kmem_cache_t cache_cache = { + .lists = LIST3_INIT(cache_cache.lists), + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .objsize = sizeof(kmem_cache_t), + .flags = SLAB_NO_REAP, + .spinlock = SPIN_LOCK_UNLOCKED, + .name = "kmem_cache", +#if DEBUG + .reallen = sizeof(kmem_cache_t), +#endif +}; + +/* Guard access to the cache-chain. */ +static struct semaphore cache_chain_sem; +static struct list_head cache_chain; + +/* + * vm_enough_memory() looks at this to determine how many + * slab-allocated pages are possibly freeable under pressure + * + * SLAB_RECLAIM_ACCOUNT turns this on per-slab + */ +atomic_t slab_reclaim_pages; +EXPORT_SYMBOL(slab_reclaim_pages); + +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. + */ +static enum { + NONE, + PARTIAL, + FULL +} g_cpucache_up; + +static DEFINE_PER_CPU(struct work_struct, reap_work); + +static void free_block(kmem_cache_t* cachep, void** objpp, int len); +static void enable_cpucache (kmem_cache_t *cachep); +static void cache_reap (void *unused); + +static inline void **ac_entry(struct array_cache *ac) +{ + return (void**)(ac+1); +} + +static inline struct array_cache *ac_data(kmem_cache_t *cachep) +{ + return cachep->array[smp_processor_id()]; +} + +static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) +{ + struct cache_sizes *csizep = malloc_sizes; + +#if DEBUG + /* This happens if someone tries to call + * kmem_cache_create(), or __kmalloc(), before + * the generic caches are initialized. + */ + BUG_ON(csizep->cs_cachep == NULL); +#endif + while (size > csizep->cs_size) + csizep++; + + /* + * Really subtile: The last entry with cs->cs_size==ULONG_MAX + * has cs_{dma,}cachep==NULL. Thus no special case + * for large kmalloc calls required. + */ + if (unlikely(gfpflags & GFP_DMA)) + return csizep->cs_dmacachep; + return csizep->cs_cachep; +} + +/* Cal the num objs, wastage, and bytes left over for a given slab size. */ +static void cache_estimate(unsigned long gfporder, size_t size, size_t align, + int flags, size_t *left_over, unsigned int *num) +{ + int i; + size_t wastage = PAGE_SIZE< 0) + i--; + + if (i > SLAB_LIMIT) + i = SLAB_LIMIT; + + *num = i; + wastage -= i*size; + wastage -= ALIGN(base+i*extra, align); + *left_over = wastage; +} + +#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) + +static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) +{ + printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + function, cachep->name, msg); + dump_stack(); +} + +/* + * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz + * via the workqueue/eventd. + * Add the CPU number into the expiration time to minimize the possibility of + * the CPUs getting into lockstep and contending for the global cache chain + * lock. + */ +static void __devinit start_cpu_timer(int cpu) +{ + struct work_struct *reap_work = &per_cpu(reap_work, cpu); + + /* + * When this gets called from do_initcalls via cpucache_init(), + * init_workqueues() has already run, so keventd will be setup + * at that time. + */ + if (keventd_up() && reap_work->func == NULL) { + INIT_WORK(reap_work, cache_reap, NULL); + schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + } +} + +static struct array_cache *alloc_arraycache(int cpu, int entries, + int batchcount) +{ + int memsize = sizeof(void*)*entries+sizeof(struct array_cache); + struct array_cache *nc = NULL; + + if (cpu != -1) { + kmem_cache_t *cachep; + cachep = kmem_find_general_cachep(memsize, GFP_KERNEL); + if (cachep) + nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu)); + } + if (!nc) + nc = kmalloc(memsize, GFP_KERNEL); + if (nc) { + nc->avail = 0; + nc->limit = entries; + nc->batchcount = batchcount; + nc->touched = 0; + } + return nc; +} + +static int __devinit cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + kmem_cache_t* cachep; + + switch (action) { + case CPU_UP_PREPARE: + down(&cache_chain_sem); + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + + nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount); + if (!nc) + goto bad; + + spin_lock_irq(&cachep->spinlock); + cachep->array[cpu] = nc; + cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + + cachep->num; + spin_unlock_irq(&cachep->spinlock); + + } + up(&cache_chain_sem); + break; + case CPU_ONLINE: + start_cpu_timer(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + /* fall thru */ + case CPU_UP_CANCELED: + down(&cache_chain_sem); + + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + + spin_lock_irq(&cachep->spinlock); + /* cpu is dead; no one can alloc from it. */ + nc = cachep->array[cpu]; + cachep->array[cpu] = NULL; + cachep->free_limit -= cachep->batchcount; + free_block(cachep, ac_entry(nc), nc->avail); + spin_unlock_irq(&cachep->spinlock); + kfree(nc); + } + up(&cache_chain_sem); + break; +#endif + } + return NOTIFY_OK; +bad: + up(&cache_chain_sem); + return NOTIFY_BAD; +} + +static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; + +/* Initialisation. + * Called after the gfp() functions have been enabled, and before smp_init(). + */ +void __init kmem_cache_init(void) +{ + size_t left_over; + struct cache_sizes *sizes; + struct cache_names *names; + + /* + * Fragmentation resistance on low memory - only use bigger + * page orders on machines with more than 32MB of memory. + */ + if (num_physpages > (32 << 20) >> PAGE_SHIFT) + slab_break_gfp_order = BREAK_GFP_ORDER_HI; + + + /* Bootstrap is tricky, because several objects are allocated + * from caches that do not exist yet: + * 1) initialize the cache_cache cache: it contains the kmem_cache_t + * structures of all caches, except cache_cache itself: cache_cache + * is statically allocated. + * Initially an __init data area is used for the head array, it's + * replaced with a kmalloc allocated array at the end of the bootstrap. + * 2) Create the first kmalloc cache. + * The kmem_cache_t for the new cache is allocated normally. An __init + * data area is used for the head array. + * 3) Create the remaining kmalloc caches, with minimally sized head arrays. + * 4) Replace the __init data head arrays for cache_cache and the first + * kmalloc cache with kmalloc allocated arrays. + * 5) Resize the head arrays of the kmalloc caches to their final sizes. + */ + + /* 1) create the cache_cache */ + init_MUTEX(&cache_chain_sem); + INIT_LIST_HEAD(&cache_chain); + list_add(&cache_cache.next, &cache_chain); + cache_cache.colour_off = cache_line_size(); + cache_cache.array[smp_processor_id()] = &initarray_cache.cache; + + cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); + + cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, + &left_over, &cache_cache.num); + if (!cache_cache.num) + BUG(); + + cache_cache.colour = left_over/cache_cache.colour_off; + cache_cache.colour_next = 0; + cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); + + /* 2+3) create the kmalloc caches */ + sizes = malloc_sizes; + names = cache_names; + + while (sizes->cs_size != ULONG_MAX) { + /* For performance, all the general caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * eliminates "false sharing". + * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. */ + sizes->cs_cachep = kmem_cache_create(names->name, + sizes->cs_size, ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + + /* Inc off-slab bufctl limit until the ceiling is hit. */ + if (!(OFF_SLAB(sizes->cs_cachep))) { + offslab_limit = sizes->cs_size-sizeof(struct slab); + offslab_limit /= sizeof(kmem_bufctl_t); + } + + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, + sizes->cs_size, ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), + NULL, NULL); + + sizes++; + names++; + } + /* 4) Replace the bootstrap head arrays */ + { + void * ptr; + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + local_irq_disable(); + BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); + memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); + cache_cache.array[smp_processor_id()] = ptr; + local_irq_enable(); + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + local_irq_disable(); + BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); + memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), + sizeof(struct arraycache_init)); + malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; + local_irq_enable(); + } + + /* 5) resize the head arrays to their final sizes */ + { + kmem_cache_t *cachep; + down(&cache_chain_sem); + list_for_each_entry(cachep, &cache_chain, next) + enable_cpucache(cachep); + up(&cache_chain_sem); + } + + /* Done! */ + g_cpucache_up = FULL; + + /* Register a cpu startup notifier callback + * that initializes ac_data for all new cpus + */ + register_cpu_notifier(&cpucache_notifier); + + + /* The reap timers are started later, with a module init call: + * That part of the kernel is not yet operational. + */ +} + +static int __init cpucache_init(void) +{ + int cpu; + + /* + * Register the timers that return unneeded + * pages to gfp. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (cpu_online(cpu)) + start_cpu_timer(cpu); + } + + return 0; +} + +__initcall(cpucache_init); + +/* + * Interface to system's page allocator. No need to hold the cache-lock. + * + * If we requested dmaable memory, we will get it. Even if we + * did not request dmaable memory, we might get it, but that + * would be relatively rare and ignorable. + */ +static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) +{ + struct page *page; + void *addr; + int i; + + flags |= cachep->gfpflags; + if (likely(nodeid == -1)) { + page = alloc_pages(flags, cachep->gfporder); + } else { + page = alloc_pages_node(nodeid, flags, cachep->gfporder); + } + if (!page) + return NULL; + addr = page_address(page); + + i = (1 << cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + atomic_add(i, &slab_reclaim_pages); + add_page_state(nr_slab, i); + while (i--) { + SetPageSlab(page); + page++; + } + return addr; +} + +/* + * Interface to system's page release. + */ +static void kmem_freepages(kmem_cache_t *cachep, void *addr) +{ + unsigned long i = (1<gfporder); + struct page *page = virt_to_page(addr); + const unsigned long nr_freed = i; + + while (i--) { + if (!TestClearPageSlab(page)) + BUG(); + page++; + } + sub_page_state(nr_slab, nr_freed); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += nr_freed; + free_pages((unsigned long)addr, cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + atomic_sub(1<gfporder, &slab_reclaim_pages); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slab_rcu *slab_rcu = (struct slab_rcu *) head; + kmem_cache_t *cachep = slab_rcu->cachep; + + kmem_freepages(cachep, slab_rcu->addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slab_rcu); +} + +#if DEBUG + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, + unsigned long caller) +{ + int size = obj_reallen(cachep); + + addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; + + if (size < 5*sizeof(unsigned long)) + return; + + *addr++=0x12345678; + *addr++=caller; + *addr++=smp_processor_id(); + size -= 3*sizeof(unsigned long); + { + unsigned long *sptr = &caller; + unsigned long svalue; + + while (!kstack_end(sptr)) { + svalue = *sptr++; + if (kernel_text_address(svalue)) { + *addr++=svalue; + size -= sizeof(unsigned long); + if (size <= sizeof(unsigned long)) + break; + } + } + + } + *addr++=0x87654321; +} +#endif + +static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) +{ + int size = obj_reallen(cachep); + addr = &((char*)addr)[obj_dbghead(cachep)]; + + memset(addr, val, size); + *(unsigned char *)(addr+size-1) = POISON_END; +} + +static void dump_line(char *data, int offset, int limit) +{ + int i; + printk(KERN_ERR "%03x:", offset); + for (i=0;iflags & SLAB_RED_ZONE) { + printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + + if (cachep->flags & SLAB_STORE_USER) { + printk(KERN_ERR "Last user: [<%p>]", + *dbg_userword(cachep, objp)); + print_symbol("(%s)", + (unsigned long)*dbg_userword(cachep, objp)); + printk("\n"); + } + realobj = (char*)objp+obj_dbghead(cachep); + size = obj_reallen(cachep); + for (i=0; i size) + limit = size-i; + dump_line(realobj, i, limit); + } +} + +static void check_poison_obj(kmem_cache_t *cachep, void *objp) +{ + char *realobj; + int size, i; + int lines = 0; + + realobj = (char*)objp+obj_dbghead(cachep); + size = obj_reallen(cachep); + + for (i=0;i size) + limit = size-i; + dump_line(realobj, i, limit); + i += 16; + lines++; + /* Limit to 5 lines */ + if (lines > 5) + break; + } + } + if (lines != 0) { + /* Print some data about the neighboring objects, if they + * exist: + */ + struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); + int objnr; + + objnr = (objp-slabp->s_mem)/cachep->objsize; + if (objnr) { + objp = slabp->s_mem+(objnr-1)*cachep->objsize; + realobj = (char*)objp+obj_dbghead(cachep); + printk(KERN_ERR "Prev obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + if (objnr+1 < cachep->num) { + objp = slabp->s_mem+(objnr+1)*cachep->objsize; + realobj = (char*)objp+obj_dbghead(cachep); + printk(KERN_ERR "Next obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + } +} +#endif + +/* Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. + */ +static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) +{ + void *addr = slabp->s_mem - slabp->colouroff; + +#if DEBUG + int i; + for (i = 0; i < cachep->num; i++) { + void *objp = slabp->s_mem + cachep->objsize * i; + + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + } + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "start of a freed object " + "was overwritten"); + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "end of a freed object " + "was overwritten"); + } + if (cachep->dtor && !(cachep->flags & SLAB_POISON)) + (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); + } +#else + if (cachep->dtor) { + int i; + for (i = 0; i < cachep->num; i++) { + void* objp = slabp->s_mem+cachep->objsize*i; + (cachep->dtor)(objp, cachep, 0); + } + } +#endif + + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { + struct slab_rcu *slab_rcu; + + slab_rcu = (struct slab_rcu *) slabp; + slab_rcu->cachep = cachep; + slab_rcu->addr = addr; + call_rcu(&slab_rcu->head, kmem_rcu_free); + } else { + kmem_freepages(cachep, addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); + } +} + +/** + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * @dtor: A destructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a int, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache + * and the @dtor is run before the pages are handed back. + * + * @name must be valid until the cache is destroyed. This implies that + * the module calling this has to destroy the cache before getting + * unloaded. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_NO_REAP - Don't automatically reap this cache when we're under + * memory pressure. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ +kmem_cache_t * +kmem_cache_create (const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), + void (*dtor)(void*, kmem_cache_t *, unsigned long)) +{ + size_t left_over, slab_size, ralign; + kmem_cache_t *cachep = NULL; + + /* + * Sanity checks... these are all serious usage bugs. + */ + if ((!name) || + in_interrupt() || + (size < BYTES_PER_WORD) || + (size > (1< BYTES_PER_WORD) + flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + } + /* 3) caller mandated alignment: disables debug if necessary */ + if (ralign < align) { + ralign = align; + if (ralign > BYTES_PER_WORD) + flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + } + /* 4) Store it. Note that the debug code below can reduce + * the alignment to BYTES_PER_WORD. + */ + align = ralign; + + /* Get cache's description obj. */ + cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + if (!cachep) + goto opps; + memset(cachep, 0, sizeof(kmem_cache_t)); + +#if DEBUG + cachep->reallen = size; + + if (flags & SLAB_RED_ZONE) { + /* redzoning only works with word aligned caches */ + align = BYTES_PER_WORD; + + /* add space for red zone words */ + cachep->dbghead += BYTES_PER_WORD; + size += 2*BYTES_PER_WORD; + } + if (flags & SLAB_STORE_USER) { + /* user store requires word alignment and + * one word storage behind the end of the real + * object. + */ + align = BYTES_PER_WORD; + size += BYTES_PER_WORD; + } +#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) + if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { + cachep->dbghead += PAGE_SIZE - size; + size = PAGE_SIZE; + } +#endif +#endif + + /* Determine if the slab management is 'on' or 'off' slab. */ + if (size >= (PAGE_SIZE>>3)) + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + flags |= CFLGS_OFF_SLAB; + + size = ALIGN(size, align); + + if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { + /* + * A VFS-reclaimable slab tends to have most allocations + * as GFP_NOFS and we really don't want to have to be allocating + * higher-order pages when we are unable to shrink dcache. + */ + cachep->gfporder = 0; + cache_estimate(cachep->gfporder, size, align, flags, + &left_over, &cachep->num); + } else { + /* + * Calculate size (in pages) of slabs, and the num of objs per + * slab. This could be made much more intelligent. For now, + * try to avoid using high page-orders for slabs. When the + * gfp() funcs are more friendly towards high-order requests, + * this should be changed. + */ + do { + unsigned int break_flag = 0; +cal_wastage: + cache_estimate(cachep->gfporder, size, align, flags, + &left_over, &cachep->num); + if (break_flag) + break; + if (cachep->gfporder >= MAX_GFP_ORDER) + break; + if (!cachep->num) + goto next; + if (flags & CFLGS_OFF_SLAB && + cachep->num > offslab_limit) { + /* This num of objs will cause problems. */ + cachep->gfporder--; + break_flag++; + goto cal_wastage; + } + + /* + * Large num of objs is good, but v. large slabs are + * currently bad for the gfp()s. + */ + if (cachep->gfporder >= slab_break_gfp_order) + break; + + if ((left_over*8) <= (PAGE_SIZE<gfporder)) + break; /* Acceptable internal fragmentation. */ +next: + cachep->gfporder++; + } while (1); + } + + if (!cachep->num) { + printk("kmem_cache_create: couldn't create cache %s.\n", name); + kmem_cache_free(&cache_cache, cachep); + cachep = NULL; + goto opps; + } + slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), align); + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { + flags &= ~CFLGS_OFF_SLAB; + left_over -= slab_size; + } + + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. No need for manual alignment */ + slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); + } + + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < align) + cachep->colour_off = align; + cachep->colour = left_over/cachep->colour_off; + cachep->slab_size = slab_size; + cachep->flags = flags; + cachep->gfpflags = 0; + if (flags & SLAB_CACHE_DMA) + cachep->gfpflags |= GFP_DMA; + spin_lock_init(&cachep->spinlock); + cachep->objsize = size; + /* NUMA */ + INIT_LIST_HEAD(&cachep->lists.slabs_full); + INIT_LIST_HEAD(&cachep->lists.slabs_partial); + INIT_LIST_HEAD(&cachep->lists.slabs_free); + + if (flags & CFLGS_OFF_SLAB) + cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); + cachep->ctor = ctor; + cachep->dtor = dtor; + cachep->name = name; + + /* Don't let CPUs to come and go */ + lock_cpu_hotplug(); + + if (g_cpucache_up == FULL) { + enable_cpucache(cachep); + } else { + if (g_cpucache_up == NONE) { + /* Note: the first kmem_cache_create must create + * the cache that's used by kmalloc(24), otherwise + * the creation of further caches will BUG(). + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + g_cpucache_up = PARTIAL; + } else { + cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); + } + BUG_ON(!ac_data(cachep)); + ac_data(cachep)->avail = 0; + ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + ac_data(cachep)->batchcount = 1; + ac_data(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; + cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + + cachep->num; + } + + cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + + /* Need the semaphore to access the chain. */ + down(&cache_chain_sem); + { + struct list_head *p; + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + list_for_each(p, &cache_chain) { + kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); + char tmp; + /* This happens when the module gets unloaded and doesn't + destroy its slab cache and noone else reuses the vmalloc + area of the module. Print a warning. */ + if (__get_user(tmp,pc->name)) { + printk("SLAB: cache with size %d has lost its name\n", + pc->objsize); + continue; + } + if (!strcmp(pc->name,name)) { + printk("kmem_cache_create: duplicate cache %s\n",name); + up(&cache_chain_sem); + unlock_cpu_hotplug(); + BUG(); + } + } + set_fs(old_fs); + } + + /* cache setup completed, link it into the list */ + list_add(&cachep->next, &cache_chain); + up(&cache_chain_sem); + unlock_cpu_hotplug(); +opps: + if (!cachep && (flags & SLAB_PANIC)) + panic("kmem_cache_create(): failed to create slab `%s'\n", + name); + return cachep; +} +EXPORT_SYMBOL(kmem_cache_create); + +#if DEBUG +static void check_irq_off(void) +{ + BUG_ON(!irqs_disabled()); +} + +static void check_irq_on(void) +{ + BUG_ON(irqs_disabled()); +} + +static void check_spinlock_acquired(kmem_cache_t *cachep) +{ +#ifdef CONFIG_SMP + check_irq_off(); + BUG_ON(spin_trylock(&cachep->spinlock)); +#endif +} +#else +#define check_irq_off() do { } while(0) +#define check_irq_on() do { } while(0) +#define check_spinlock_acquired(x) do { } while(0) +#endif + +/* + * Waits for all CPUs to execute func(). + */ +static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) +{ + check_irq_on(); + preempt_disable(); + + local_irq_disable(); + func(arg); + local_irq_enable(); + + if (smp_call_function(func, arg, 1, 1)) + BUG(); + + preempt_enable(); +} + +static void drain_array_locked(kmem_cache_t* cachep, + struct array_cache *ac, int force); + +static void do_drain(void *arg) +{ + kmem_cache_t *cachep = (kmem_cache_t*)arg; + struct array_cache *ac; + + check_irq_off(); + ac = ac_data(cachep); + spin_lock(&cachep->spinlock); + free_block(cachep, &ac_entry(ac)[0], ac->avail); + spin_unlock(&cachep->spinlock); + ac->avail = 0; +} + +static void drain_cpu_caches(kmem_cache_t *cachep) +{ + smp_call_function_all_cpus(do_drain, cachep); + check_irq_on(); + spin_lock_irq(&cachep->spinlock); + if (cachep->lists.shared) + drain_array_locked(cachep, cachep->lists.shared, 1); + spin_unlock_irq(&cachep->spinlock); +} + + +/* NUMA shrink all list3s */ +static int __cache_shrink(kmem_cache_t *cachep) +{ + struct slab *slabp; + int ret; + + drain_cpu_caches(cachep); + + check_irq_on(); + spin_lock_irq(&cachep->spinlock); + + for(;;) { + struct list_head *p; + + p = cachep->lists.slabs_free.prev; + if (p == &cachep->lists.slabs_free) + break; + + slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list); +#if DEBUG + if (slabp->inuse) + BUG(); +#endif + list_del(&slabp->list); + + cachep->lists.free_objects -= cachep->num; + spin_unlock_irq(&cachep->spinlock); + slab_destroy(cachep, slabp); + spin_lock_irq(&cachep->spinlock); + } + ret = !list_empty(&cachep->lists.slabs_full) || + !list_empty(&cachep->lists.slabs_partial); + spin_unlock_irq(&cachep->spinlock); + return ret; +} + +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. + */ +int kmem_cache_shrink(kmem_cache_t *cachep) +{ + if (!cachep || in_interrupt()) + BUG(); + + return __cache_shrink(cachep); +} +EXPORT_SYMBOL(kmem_cache_shrink); + +/** + * kmem_cache_destroy - delete a cache + * @cachep: the cache to destroy + * + * Remove a kmem_cache_t object from the slab cache. + * Returns 0 on success. + * + * It is expected this function will be called by a module when it is + * unloaded. This will remove the cache completely, and avoid a duplicate + * cache being allocated each time a module is loaded and unloaded, if the + * module doesn't have persistent in-kernel storage across loads and unloads. + * + * The cache must be empty before calling this function. + * + * The caller must guarantee that noone will allocate memory from the cache + * during the kmem_cache_destroy(). + */ +int kmem_cache_destroy(kmem_cache_t * cachep) +{ + int i; + + if (!cachep || in_interrupt()) + BUG(); + + /* Don't let CPUs to come and go */ + lock_cpu_hotplug(); + + /* Find the cache in the chain of caches. */ + down(&cache_chain_sem); + /* + * the chain is never empty, cache_cache is never destroyed + */ + list_del(&cachep->next); + up(&cache_chain_sem); + + if (__cache_shrink(cachep)) { + slab_error(cachep, "Can't free all objects"); + down(&cache_chain_sem); + list_add(&cachep->next,&cache_chain); + up(&cache_chain_sem); + unlock_cpu_hotplug(); + return 1; + } + + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + synchronize_kernel(); + + /* no cpu_online check required here since we clear the percpu + * array on cpu offline and set this to NULL. + */ + for (i = 0; i < NR_CPUS; i++) + kfree(cachep->array[i]); + + /* NUMA: free the list3 structures */ + kfree(cachep->lists.shared); + cachep->lists.shared = NULL; + kmem_cache_free(&cache_cache, cachep); + + unlock_cpu_hotplug(); + + return 0; +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/* Get the memory for a slab management obj. */ +static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, + void *objp, int colour_off, unsigned int __nocast local_flags) +{ + struct slab *slabp; + + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. */ + slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); + if (!slabp) + return NULL; + } else { + slabp = objp+colour_off; + colour_off += cachep->slab_size; + } + slabp->inuse = 0; + slabp->colouroff = colour_off; + slabp->s_mem = objp+colour_off; + + return slabp; +} + +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +{ + return (kmem_bufctl_t *)(slabp+1); +} + +static void cache_init_objs(kmem_cache_t *cachep, + struct slab *slabp, unsigned long ctor_flags) +{ + int i; + + for (i = 0; i < cachep->num; i++) { + void* objp = slabp->s_mem+cachep->objsize*i; +#if DEBUG + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) + poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = NULL; + + if (cachep->flags & SLAB_RED_ZONE) { + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + /* + * Constructors are not allowed to allocate memory from + * the same cache which they are a constructor for. + * Otherwise, deadlock. They must also be threaded. + */ + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the" + " end of an object"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the" + " start of an object"); + } + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); +#else + if (cachep->ctor) + cachep->ctor(objp, cachep, ctor_flags); +#endif + slab_bufctl(slabp)[i] = i+1; + } + slab_bufctl(slabp)[i-1] = BUFCTL_END; + slabp->free = 0; +} + +static void kmem_flagcheck(kmem_cache_t *cachep, unsigned int flags) +{ + if (flags & SLAB_DMA) { + if (!(cachep->gfpflags & GFP_DMA)) + BUG(); + } else { + if (cachep->gfpflags & GFP_DMA) + BUG(); + } +} + +static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) +{ + int i; + struct page *page; + + /* Nasty!!!!!! I hope this is OK. */ + i = 1 << cachep->gfporder; + page = virt_to_page(objp); + do { + SET_PAGE_CACHE(page, cachep); + SET_PAGE_SLAB(page, slabp); + page++; + } while (--i); +} + +/* + * Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. + */ +static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) +{ + struct slab *slabp; + void *objp; + size_t offset; + unsigned int local_flags; + unsigned long ctor_flags; + + /* Be lazy and only check for valid flags here, + * keeping it out of the critical path in kmem_cache_alloc(). + */ + if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) + BUG(); + if (flags & SLAB_NO_GROW) + return 0; + + ctor_flags = SLAB_CTOR_CONSTRUCTOR; + local_flags = (flags & SLAB_LEVEL_MASK); + if (!(local_flags & __GFP_WAIT)) + /* + * Not allowed to sleep. Need to tell a constructor about + * this - it might need to know... + */ + ctor_flags |= SLAB_CTOR_ATOMIC; + + /* About to mess with non-constant members - lock. */ + check_irq_off(); + spin_lock(&cachep->spinlock); + + /* Get colour for the slab, and cal the next value. */ + offset = cachep->colour_next; + cachep->colour_next++; + if (cachep->colour_next >= cachep->colour) + cachep->colour_next = 0; + offset *= cachep->colour_off; + + spin_unlock(&cachep->spinlock); + + if (local_flags & __GFP_WAIT) + local_irq_enable(); + + /* + * The test for missing atomic flag is performed here, rather than + * the more obvious place, simply to reduce the critical path length + * in kmem_cache_alloc(). If a caller is seriously mis-behaving they + * will eventually be caught here (where it matters). + */ + kmem_flagcheck(cachep, flags); + + + /* Get mem for the objs. */ + if (!(objp = kmem_getpages(cachep, flags, nodeid))) + goto failed; + + /* Get slab management. */ + if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) + goto opps1; + + set_slab_attr(cachep, slabp, objp); + + cache_init_objs(cachep, slabp, ctor_flags); + + if (local_flags & __GFP_WAIT) + local_irq_disable(); + check_irq_off(); + spin_lock(&cachep->spinlock); + + /* Make slab active. */ + list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); + STATS_INC_GROWN(cachep); + list3_data(cachep)->free_objects += cachep->num; + spin_unlock(&cachep->spinlock); + return 1; +opps1: + kmem_freepages(cachep, objp); +failed: + if (local_flags & __GFP_WAIT) + local_irq_disable(); + return 0; +} + +#if DEBUG + +/* + * Perform extra freeing checks: + * - detect bad pointers. + * - POISON/RED_ZONE checking + * - destructor calls, for caches with POISON+dtor + */ +static void kfree_debugcheck(const void *objp) +{ + struct page *page; + + if (!virt_addr_valid(objp)) { + printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + (unsigned long)objp); + BUG(); + } + page = virt_to_page(objp); + if (!PageSlab(page)) { + printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); + BUG(); + } +} + +static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, + void *caller) +{ + struct page *page; + unsigned int objnr; + struct slab *slabp; + + objp -= obj_dbghead(cachep); + kfree_debugcheck(objp); + page = virt_to_page(objp); + + if (GET_PAGE_CACHE(page) != cachep) { + printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", + GET_PAGE_CACHE(page),cachep); + printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); + printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); + WARN_ON(1); + } + slabp = GET_PAGE_SLAB(page); + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); + } + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = caller; + + objnr = (objp-slabp->s_mem)/cachep->objsize; + + BUG_ON(objnr >= cachep->num); + BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); + + if (cachep->flags & SLAB_DEBUG_INITIAL) { + /* Need to call the slab's constructor so the + * caller can perform a verify of its state (debugging). + * Called without the cache-lock held. + */ + cachep->ctor(objp+obj_dbghead(cachep), + cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); + } + if (cachep->flags & SLAB_POISON && cachep->dtor) { + /* we want to cache poison the object, + * call the destruction callback + */ + cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); + } + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + store_stackinfo(cachep, objp, (unsigned long)caller); + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); + } else { + poison_obj(cachep, objp, POISON_FREE); + } +#else + poison_obj(cachep, objp, POISON_FREE); +#endif + } + return objp; +} + +static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) +{ + kmem_bufctl_t i; + int entries = 0; + + check_spinlock_acquired(cachep); + /* Check slab's freelist to see if this obj is there. */ + for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { + entries++; + if (entries > cachep->num || i >= cachep->num) + goto bad; + } + if (entries != cachep->num - slabp->inuse) { +bad: + printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); + for (i=0;inum*sizeof(kmem_bufctl_t);i++) { + if ((i%16)==0) + printk("\n%03x:", i); + printk(" %02x", ((unsigned char*)slabp)[i]); + } + printk("\n"); + BUG(); + } +} +#else +#define kfree_debugcheck(x) do { } while(0) +#define cache_free_debugcheck(x,objp,z) (objp) +#define check_slabp(x,y) do { } while(0) +#endif + +static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags) +{ + int batchcount; + struct kmem_list3 *l3; + struct array_cache *ac; + + check_irq_off(); + ac = ac_data(cachep); +retry: + batchcount = ac->batchcount; + if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { + /* if there was little recent activity on this + * cache, then perform only a partial refill. + * Otherwise we could generate refill bouncing. + */ + batchcount = BATCHREFILL_LIMIT; + } + l3 = list3_data(cachep); + + BUG_ON(ac->avail > 0); + spin_lock(&cachep->spinlock); + if (l3->shared) { + struct array_cache *shared_array = l3->shared; + if (shared_array->avail) { + if (batchcount > shared_array->avail) + batchcount = shared_array->avail; + shared_array->avail -= batchcount; + ac->avail = batchcount; + memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail], + sizeof(void*)*batchcount); + shared_array->touched = 1; + goto alloc_done; + } + } + while (batchcount > 0) { + struct list_head *entry; + struct slab *slabp; + /* Get slab alloc is to come from. */ + entry = l3->slabs_partial.next; + if (entry == &l3->slabs_partial) { + l3->free_touched = 1; + entry = l3->slabs_free.next; + if (entry == &l3->slabs_free) + goto must_grow; + } + + slabp = list_entry(entry, struct slab, list); + check_slabp(cachep, slabp); + check_spinlock_acquired(cachep); + while (slabp->inuse < cachep->num && batchcount--) { + kmem_bufctl_t next; + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + /* get obj pointer */ + ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize; + + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; +#if DEBUG + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; +#endif + slabp->free = next; + } + check_slabp(cachep, slabp); + + /* move slabp to correct slabp list: */ + list_del(&slabp->list); + if (slabp->free == BUFCTL_END) + list_add(&slabp->list, &l3->slabs_full); + else + list_add(&slabp->list, &l3->slabs_partial); + } + +must_grow: + l3->free_objects -= ac->avail; +alloc_done: + spin_unlock(&cachep->spinlock); + + if (unlikely(!ac->avail)) { + int x; + x = cache_grow(cachep, flags, -1); + + // cache_grow can reenable interrupts, then ac could change. + ac = ac_data(cachep); + if (!x && ac->avail == 0) // no objects in sight? abort + return NULL; + + if (!ac->avail) // objects refilled by interrupt? + goto retry; + } + ac->touched = 1; + return ac_entry(ac)[--ac->avail]; +} + +static inline void +cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags) +{ + might_sleep_if(flags & __GFP_WAIT); +#if DEBUG + kmem_flagcheck(cachep, flags); +#endif +} + +#if DEBUG +static void * +cache_alloc_debugcheck_after(kmem_cache_t *cachep, + unsigned long flags, void *objp, void *caller) +{ + if (!objp) + return objp; + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + poison_obj(cachep, objp, POISON_INUSE); + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = caller; + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); + } + *dbg_redzone1(cachep, objp) = RED_ACTIVE; + *dbg_redzone2(cachep, objp) = RED_ACTIVE; + } + objp += obj_dbghead(cachep); + if (cachep->ctor && cachep->flags & SLAB_POISON) { + unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; + + if (!(flags & __GFP_WAIT)) + ctor_flags |= SLAB_CTOR_ATOMIC; + + cachep->ctor(objp, cachep, ctor_flags); + } + return objp; +} +#else +#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) +#endif + + +static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) +{ + unsigned long save_flags; + void* objp; + struct array_cache *ac; + + cache_alloc_debugcheck_before(cachep, flags); + + local_irq_save(save_flags); + ac = ac_data(cachep); + if (likely(ac->avail)) { + STATS_INC_ALLOCHIT(cachep); + ac->touched = 1; + objp = ac_entry(ac)[--ac->avail]; + } else { + STATS_INC_ALLOCMISS(cachep); + objp = cache_alloc_refill(cachep, flags); + } + local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); + return objp; +} + +/* + * NUMA: different approach needed if the spinlock is moved into + * the l3 structure + */ + +static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) +{ + int i; + + check_spinlock_acquired(cachep); + + /* NUMA: move add into loop */ + cachep->lists.free_objects += nr_objects; + + for (i = 0; i < nr_objects; i++) { + void *objp = objpp[i]; + struct slab *slabp; + unsigned int objnr; + + slabp = GET_PAGE_SLAB(virt_to_page(objp)); + list_del(&slabp->list); + objnr = (objp - slabp->s_mem) / cachep->objsize; + check_slabp(cachep, slabp); +#if DEBUG + if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { + printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n", + cachep->name, objp); + BUG(); + } +#endif + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + STATS_DEC_ACTIVE(cachep); + slabp->inuse--; + check_slabp(cachep, slabp); + + /* fixup slab chains */ + if (slabp->inuse == 0) { + if (cachep->lists.free_objects > cachep->free_limit) { + cachep->lists.free_objects -= cachep->num; + slab_destroy(cachep, slabp); + } else { + list_add(&slabp->list, + &list3_data_ptr(cachep, objp)->slabs_free); + } + } else { + /* Unconditionally move a slab to the end of the + * partial list on free - maximum time for the + * other objects to be freed, too. + */ + list_add_tail(&slabp->list, + &list3_data_ptr(cachep, objp)->slabs_partial); + } + } +} + +static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) +{ + int batchcount; + + batchcount = ac->batchcount; +#if DEBUG + BUG_ON(!batchcount || batchcount > ac->avail); +#endif + check_irq_off(); + spin_lock(&cachep->spinlock); + if (cachep->lists.shared) { + struct array_cache *shared_array = cachep->lists.shared; + int max = shared_array->limit-shared_array->avail; + if (max) { + if (batchcount > max) + batchcount = max; + memcpy(&ac_entry(shared_array)[shared_array->avail], + &ac_entry(ac)[0], + sizeof(void*)*batchcount); + shared_array->avail += batchcount; + goto free_done; + } + } + + free_block(cachep, &ac_entry(ac)[0], batchcount); +free_done: +#if STATS + { + int i = 0; + struct list_head *p; + + p = list3_data(cachep)->slabs_free.next; + while (p != &(list3_data(cachep)->slabs_free)) { + struct slab *slabp; + + slabp = list_entry(p, struct slab, list); + BUG_ON(slabp->inuse); + + i++; + p = p->next; + } + STATS_SET_FREEABLE(cachep, i); + } +#endif + spin_unlock(&cachep->spinlock); + ac->avail -= batchcount; + memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], + sizeof(void*)*ac->avail); +} + +/* + * __cache_free + * Release an obj back to its cache. If the obj has a constructed + * state, it must be in this state _before_ it is released. + * + * Called with disabled ints. + */ +static inline void __cache_free(kmem_cache_t *cachep, void *objp) +{ + struct array_cache *ac = ac_data(cachep); + + check_irq_off(); + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + + if (likely(ac->avail < ac->limit)) { + STATS_INC_FREEHIT(cachep); + ac_entry(ac)[ac->avail++] = objp; + return; + } else { + STATS_INC_FREEMISS(cachep); + cache_flusharray(cachep, ac); + ac_entry(ac)[ac->avail++] = objp; + } +} + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. The flags are only relevant + * if the cache has no available objects. + */ +void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags) +{ + return __cache_alloc(cachep, flags); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +/** + * kmem_ptr_validate - check if an untrusted pointer might + * be a slab entry. + * @cachep: the cache we're checking against + * @ptr: pointer to validate + * + * This verifies that the untrusted pointer looks sane: + * it is _not_ a guarantee that the pointer is actually + * part of the slab cache in question, but it at least + * validates that the pointer can be dereferenced and + * looks half-way sane. + * + * Currently only used for dentry validation. + */ +int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) +{ + unsigned long addr = (unsigned long) ptr; + unsigned long min_addr = PAGE_OFFSET; + unsigned long align_mask = BYTES_PER_WORD-1; + unsigned long size = cachep->objsize; + struct page *page; + + if (unlikely(addr < min_addr)) + goto out; + if (unlikely(addr > (unsigned long)high_memory - size)) + goto out; + if (unlikely(addr & align_mask)) + goto out; + if (unlikely(!kern_addr_valid(addr))) + goto out; + if (unlikely(!kern_addr_valid(addr + size - 1))) + goto out; + page = virt_to_page(ptr); + if (unlikely(!PageSlab(page))) + goto out; + if (unlikely(GET_PAGE_CACHE(page) != cachep)) + goto out; + return 1; +out: + return 0; +} + +#ifdef CONFIG_NUMA +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * + * Identical to kmem_cache_alloc, except that this function is slow + * and can sleep. And it will allocate memory on the given node, which + * can improve the performance for cpu bound structures. + */ +void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) +{ + int loop; + void *objp; + struct slab *slabp; + kmem_bufctl_t next; + + for (loop = 0;;loop++) { + struct list_head *q; + + objp = NULL; + check_irq_on(); + spin_lock_irq(&cachep->spinlock); + /* walk through all partial and empty slab and find one + * from the right node */ + list_for_each(q,&cachep->lists.slabs_partial) { + slabp = list_entry(q, struct slab, list); + + if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid || + loop > 2) + goto got_slabp; + } + list_for_each(q, &cachep->lists.slabs_free) { + slabp = list_entry(q, struct slab, list); + + if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid || + loop > 2) + goto got_slabp; + } + spin_unlock_irq(&cachep->spinlock); + + local_irq_disable(); + if (!cache_grow(cachep, GFP_KERNEL, nodeid)) { + local_irq_enable(); + return NULL; + } + local_irq_enable(); + } +got_slabp: + /* found one: allocate object */ + check_slabp(cachep, slabp); + check_spinlock_acquired(cachep); + + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + STATS_INC_NODEALLOCS(cachep); + + objp = slabp->s_mem + slabp->free*cachep->objsize; + + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; +#if DEBUG + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; +#endif + slabp->free = next; + check_slabp(cachep, slabp); + + /* move slabp to correct slabp list: */ + list_del(&slabp->list); + if (slabp->free == BUFCTL_END) + list_add(&slabp->list, &cachep->lists.slabs_full); + else + list_add(&slabp->list, &cachep->lists.slabs_partial); + + list3_data(cachep)->free_objects--; + spin_unlock_irq(&cachep->spinlock); + + objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp, + __builtin_return_address(0)); + return objp; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +#endif + +/** + * kmalloc - allocate memory + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * kmalloc is the normal method of allocating memory + * in the kernel. + * + * The @flags argument may be one of: + * + * %GFP_USER - Allocate memory on behalf of user. May sleep. + * + * %GFP_KERNEL - Allocate normal kernel ram. May sleep. + * + * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. + * + * Additionally, the %GFP_DMA flag may be set to indicate the memory + * must be suitable for DMA. This can mean different things on different + * platforms. For example, on i386, it means that the memory must come + * from the first 16MB. + */ +void *__kmalloc(size_t size, unsigned int __nocast flags) +{ + kmem_cache_t *cachep; + + cachep = kmem_find_general_cachep(size, flags); + if (unlikely(cachep == NULL)) + return NULL; + return __cache_alloc(cachep, flags); +} +EXPORT_SYMBOL(__kmalloc); + +#ifdef CONFIG_SMP +/** + * __alloc_percpu - allocate one copy of the object for every present + * cpu in the system, zeroing them. + * Objects should be dereferenced using the per_cpu_ptr macro only. + * + * @size: how many bytes of memory are required. + * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + int i; + struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + + if (!pdata) + return NULL; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + pdata->ptrs[i] = kmem_cache_alloc_node( + kmem_find_general_cachep(size, GFP_KERNEL), + cpu_to_node(i)); + + if (!pdata->ptrs[i]) + goto unwind_oom; + memset(pdata->ptrs[i], 0, size); + } + + /* Catch derefs w/o wrappers */ + return (void *) (~(unsigned long) pdata); + +unwind_oom: + while (--i >= 0) { + if (!cpu_possible(i)) + continue; + kfree(pdata->ptrs[i]); + } + kfree(pdata); + return NULL; +} +EXPORT_SYMBOL(__alloc_percpu); +#endif + +/** + * kmem_cache_free - Deallocate an object + * @cachep: The cache the allocation was from. + * @objp: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ +void kmem_cache_free(kmem_cache_t *cachep, void *objp) +{ + unsigned long flags; + + local_irq_save(flags); + __cache_free(cachep, objp); + local_irq_restore(flags); +} +EXPORT_SYMBOL(kmem_cache_free); + +/** + * kcalloc - allocate memory for an array. The memory is set to zero. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate. + */ +void *kcalloc(size_t n, size_t size, unsigned int __nocast flags) +{ + void *ret = NULL; + + if (n != 0 && size > INT_MAX / n) + return ret; + + ret = kmalloc(n * size, flags); + if (ret) + memset(ret, 0, n * size); + return ret; +} +EXPORT_SYMBOL(kcalloc); + +/** + * kfree - free previously allocated memory + * @objp: pointer returned by kmalloc. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *objp) +{ + kmem_cache_t *c; + unsigned long flags; + + if (unlikely(!objp)) + return; + local_irq_save(flags); + kfree_debugcheck(objp); + c = GET_PAGE_CACHE(virt_to_page(objp)); + __cache_free(c, (void*)objp); + local_irq_restore(flags); +} +EXPORT_SYMBOL(kfree); + +#ifdef CONFIG_SMP +/** + * free_percpu - free previously allocated percpu memory + * @objp: pointer returned by alloc_percpu. + * + * Don't free memory not originally allocated by alloc_percpu() + * The complemented objp is to check for that. + */ +void +free_percpu(const void *objp) +{ + int i; + struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + kfree(p->ptrs[i]); + } + kfree(p); +} +EXPORT_SYMBOL(free_percpu); +#endif + +unsigned int kmem_cache_size(kmem_cache_t *cachep) +{ + return obj_reallen(cachep); +} +EXPORT_SYMBOL(kmem_cache_size); + +struct ccupdate_struct { + kmem_cache_t *cachep; + struct array_cache *new[NR_CPUS]; +}; + +static void do_ccupdate_local(void *info) +{ + struct ccupdate_struct *new = (struct ccupdate_struct *)info; + struct array_cache *old; + + check_irq_off(); + old = ac_data(new->cachep); + + new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; + new->new[smp_processor_id()] = old; +} + + +static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, + int shared) +{ + struct ccupdate_struct new; + struct array_cache *new_shared; + int i; + + memset(&new.new,0,sizeof(new.new)); + for (i = 0; i < NR_CPUS; i++) { + if (cpu_online(i)) { + new.new[i] = alloc_arraycache(i, limit, batchcount); + if (!new.new[i]) { + for (i--; i >= 0; i--) kfree(new.new[i]); + return -ENOMEM; + } + } else { + new.new[i] = NULL; + } + } + new.cachep = cachep; + + smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + + check_irq_on(); + spin_lock_irq(&cachep->spinlock); + cachep->batchcount = batchcount; + cachep->limit = limit; + cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; + spin_unlock_irq(&cachep->spinlock); + + for (i = 0; i < NR_CPUS; i++) { + struct array_cache *ccold = new.new[i]; + if (!ccold) + continue; + spin_lock_irq(&cachep->spinlock); + free_block(cachep, ac_entry(ccold), ccold->avail); + spin_unlock_irq(&cachep->spinlock); + kfree(ccold); + } + new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d); + if (new_shared) { + struct array_cache *old; + + spin_lock_irq(&cachep->spinlock); + old = cachep->lists.shared; + cachep->lists.shared = new_shared; + if (old) + free_block(cachep, ac_entry(old), old->avail); + spin_unlock_irq(&cachep->spinlock); + kfree(old); + } + + return 0; +} + + +static void enable_cpucache(kmem_cache_t *cachep) +{ + int err; + int limit, shared; + + /* The head array serves three purposes: + * - create a LIFO ordering, i.e. return objects that are cache-warm + * - reduce the number of spinlock operations. + * - reduce the number of linked list operations on the slab and + * bufctl chains: array operations are cheaper. + * The numbers are guessed, we should auto-tune as described by + * Bonwick. + */ + if (cachep->objsize > 131072) + limit = 1; + else if (cachep->objsize > PAGE_SIZE) + limit = 8; + else if (cachep->objsize > 1024) + limit = 24; + else if (cachep->objsize > 256) + limit = 54; + else + limit = 120; + + /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound + * allocation behaviour: Most allocs on one cpu, most free operations + * on another cpu. For these cases, an efficient object passing between + * cpus is necessary. This is provided by a shared array. The array + * replaces Bonwick's magazine layer. + * On uniprocessor, it's functionally equivalent (but less efficient) + * to a larger limit. Thus disabled by default. + */ + shared = 0; +#ifdef CONFIG_SMP + if (cachep->objsize <= PAGE_SIZE) + shared = 8; +#endif + +#if DEBUG + /* With debugging enabled, large batchcount lead to excessively + * long periods with disabled local interrupts. Limit the + * batchcount + */ + if (limit > 32) + limit = 32; +#endif + err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); + if (err) + printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + cachep->name, -err); +} + +static void drain_array_locked(kmem_cache_t *cachep, + struct array_cache *ac, int force) +{ + int tofree; + + check_spinlock_acquired(cachep); + if (ac->touched && !force) { + ac->touched = 0; + } else if (ac->avail) { + tofree = force ? ac->avail : (ac->limit+4)/5; + if (tofree > ac->avail) { + tofree = (ac->avail+1)/2; + } + free_block(cachep, ac_entry(ac), tofree); + ac->avail -= tofree; + memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree], + sizeof(void*)*ac->avail); + } +} + +/** + * cache_reap - Reclaim memory from caches. + * + * Called from workqueue/eventd every few seconds. + * Purpose: + * - clear the per-cpu caches for this CPU. + * - return freeable pages to the main free memory pool. + * + * If we cannot acquire the cache chain semaphore then just give up - we'll + * try again on the next iteration. + */ +static void cache_reap(void *unused) +{ + struct list_head *walk; + + if (down_trylock(&cache_chain_sem)) { + /* Give up. Setup the next iteration. */ + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + return; + } + + list_for_each(walk, &cache_chain) { + kmem_cache_t *searchp; + struct list_head* p; + int tofree; + struct slab *slabp; + + searchp = list_entry(walk, kmem_cache_t, next); + + if (searchp->flags & SLAB_NO_REAP) + goto next; + + check_irq_on(); + + spin_lock_irq(&searchp->spinlock); + + drain_array_locked(searchp, ac_data(searchp), 0); + + if(time_after(searchp->lists.next_reap, jiffies)) + goto next_unlock; + + searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; + + if (searchp->lists.shared) + drain_array_locked(searchp, searchp->lists.shared, 0); + + if (searchp->lists.free_touched) { + searchp->lists.free_touched = 0; + goto next_unlock; + } + + tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num); + do { + p = list3_data(searchp)->slabs_free.next; + if (p == &(list3_data(searchp)->slabs_free)) + break; + + slabp = list_entry(p, struct slab, list); + BUG_ON(slabp->inuse); + list_del(&slabp->list); + STATS_INC_REAPED(searchp); + + /* Safe to drop the lock. The slab is no longer + * linked to the cache. + * searchp cannot disappear, we hold + * cache_chain_lock + */ + searchp->lists.free_objects -= searchp->num; + spin_unlock_irq(&searchp->spinlock); + slab_destroy(searchp, slabp); + spin_lock_irq(&searchp->spinlock); + } while(--tofree > 0); +next_unlock: + spin_unlock_irq(&searchp->spinlock); +next: + cond_resched(); + } + check_irq_on(); + up(&cache_chain_sem); + /* Setup the next iteration */ + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); +} + +#ifdef CONFIG_PROC_FS + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + down(&cache_chain_sem); + if (!n) { + /* + * Output format version, so at least we can change it + * without _too_ many complaints. + */ +#if STATS + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); +#if STATS + seq_puts(m, " : globalstat " + " "); + seq_puts(m, " : cpustat "); +#endif + seq_putc(m, '\n'); + } + p = cache_chain.next; + while (n--) { + p = p->next; + if (p == &cache_chain) + return NULL; + } + return list_entry(p, kmem_cache_t, next); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + kmem_cache_t *cachep = p; + ++*pos; + return cachep->next.next == &cache_chain ? NULL + : list_entry(cachep->next.next, kmem_cache_t, next); +} + +static void s_stop(struct seq_file *m, void *p) +{ + up(&cache_chain_sem); +} + +static int s_show(struct seq_file *m, void *p) +{ + kmem_cache_t *cachep = p; + struct list_head *q; + struct slab *slabp; + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs = 0; + unsigned long num_slabs; + const char *name; + char *error = NULL; + + check_irq_on(); + spin_lock_irq(&cachep->spinlock); + active_objs = 0; + num_slabs = 0; + list_for_each(q,&cachep->lists.slabs_full) { + slabp = list_entry(q, struct slab, list); + if (slabp->inuse != cachep->num && !error) + error = "slabs_full accounting error"; + active_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->lists.slabs_partial) { + slabp = list_entry(q, struct slab, list); + if (slabp->inuse == cachep->num && !error) + error = "slabs_partial inuse accounting error"; + if (!slabp->inuse && !error) + error = "slabs_partial/inuse accounting error"; + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each(q,&cachep->lists.slabs_free) { + slabp = list_entry(q, struct slab, list); + if (slabp->inuse && !error) + error = "slabs_free/inuse accounting error"; + num_slabs++; + } + num_slabs+=active_slabs; + num_objs = num_slabs*cachep->num; + if (num_objs - active_objs != cachep->lists.free_objects && !error) + error = "free_objects accounting error"; + + name = cachep->name; + if (error) + printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + name, active_objs, num_objs, cachep->objsize, + cachep->num, (1<gfporder)); + seq_printf(m, " : tunables %4u %4u %4u", + cachep->limit, cachep->batchcount, + cachep->lists.shared->limit/cachep->batchcount); + seq_printf(m, " : slabdata %6lu %6lu %6u", + active_slabs, num_slabs, cachep->lists.shared->avail); +#if STATS + { /* list3 stats */ + unsigned long high = cachep->high_mark; + unsigned long allocs = cachep->num_allocations; + unsigned long grown = cachep->grown; + unsigned long reaped = cachep->reaped; + unsigned long errors = cachep->errors; + unsigned long max_freeable = cachep->max_freeable; + unsigned long free_limit = cachep->free_limit; + unsigned long node_allocs = cachep->node_allocs; + + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, reaped, errors, + max_freeable, free_limit, node_allocs); + } + /* cpu stats */ + { + unsigned long allochit = atomic_read(&cachep->allochit); + unsigned long allocmiss = atomic_read(&cachep->allocmiss); + unsigned long freehit = atomic_read(&cachep->freehit); + unsigned long freemiss = atomic_read(&cachep->freemiss); + + seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", + allochit, allocmiss, freehit, freemiss); + } +#endif + seq_putc(m, '\n'); + spin_unlock_irq(&cachep->spinlock); + return 0; +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ + +struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +#define MAX_SLABINFO_WRITE 128 +/** + * slabinfo_write - Tuning for the slab allocator + * @file: unused + * @buffer: user buffer + * @count: data length + * @ppos: unused + */ +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char kbuf[MAX_SLABINFO_WRITE+1], *tmp; + int limit, batchcount, shared, res; + struct list_head *p; + + if (count > MAX_SLABINFO_WRITE) + return -EINVAL; + if (copy_from_user(&kbuf, buffer, count)) + return -EFAULT; + kbuf[MAX_SLABINFO_WRITE] = '\0'; + + tmp = strchr(kbuf, ' '); + if (!tmp) + return -EINVAL; + *tmp = '\0'; + tmp++; + if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) + return -EINVAL; + + /* Find the cache in the chain of caches. */ + down(&cache_chain_sem); + res = -EINVAL; + list_for_each(p,&cache_chain) { + kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); + + if (!strcmp(cachep->name, kbuf)) { + if (limit < 1 || + batchcount < 1 || + batchcount > limit || + shared < 0) { + res = -EINVAL; + } else { + res = do_tune_cpucache(cachep, limit, batchcount, shared); + } + break; + } + } + up(&cache_chain_sem); + if (res >= 0) + res = count; + return res; +} +#endif + +unsigned int ksize(const void *objp) +{ + kmem_cache_t *c; + unsigned long flags; + unsigned int size = 0; + + if (likely(objp != NULL)) { + local_irq_save(flags); + c = GET_PAGE_CACHE(virt_to_page(objp)); + size = kmem_cache_size(c); + local_irq_restore(flags); + } + + return size; +} diff --git a/mm/swap.c b/mm/swap.c new file mode 100644 index 000000000000..7771d2803f62 --- /dev/null +++ b/mm/swap.c @@ -0,0 +1,485 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file contains the default values for the opereation of the + * Linux VM subsystem. Fine-tuning documentation can be found in + * Documentation/sysctl/vm.txt. + * Started 18.12.91 + * Swap aging added 23.2.95, Stephen Tweedie. + * Buffermem limits added 12.3.98, Rik van Riel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for try_to_release_page() */ +#include +#include +#include +#include +#include +#include + +/* How many pages do we try to swap or page in/out together? */ +int page_cluster; + +#ifdef CONFIG_HUGETLB_PAGE + +void put_page(struct page *page) +{ + if (unlikely(PageCompound(page))) { + page = (struct page *)page->private; + if (put_page_testzero(page)) { + void (*dtor)(struct page *page); + + dtor = (void (*)(struct page *))page[1].mapping; + (*dtor)(page); + } + return; + } + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} +EXPORT_SYMBOL(put_page); +#endif + +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. The page still has PageWriteback set, which will pin it. + * + * We don't expect many pages to come through here, so don't bother batching + * things up. + * + * To avoid placing the page at the tail of the LRU while PG_writeback is still + * set, this function will clear PG_writeback before performing the page + * motion. Do that inside the lru lock because once PG_writeback is cleared + * we may not touch the page. + * + * Returns zero if it cleared PG_writeback. + */ +int rotate_reclaimable_page(struct page *page) +{ + struct zone *zone; + unsigned long flags; + + if (PageLocked(page)) + return 1; + if (PageDirty(page)) + return 1; + if (PageActive(page)) + return 1; + if (!PageLRU(page)) + return 1; + + zone = page_zone(page); + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(page) && !PageActive(page)) { + list_del(&page->lru); + list_add_tail(&page->lru, &zone->inactive_list); + inc_page_state(pgrotated); + } + if (!test_clear_page_writeback(page)) + BUG(); + spin_unlock_irqrestore(&zone->lru_lock, flags); + return 0; +} + +/* + * FIXME: speed this up? + */ +void fastcall activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && !PageActive(page)) { + del_page_from_inactive_list(zone, page); + SetPageActive(page); + add_page_to_active_list(zone, page); + inc_page_state(pgactivate); + } + spin_unlock_irq(&zone->lru_lock); +} + +/* + * Mark a page as having seen activity. + * + * inactive,unreferenced -> inactive,referenced + * inactive,referenced -> active,unreferenced + * active,unreferenced -> active,referenced + */ +void fastcall mark_page_accessed(struct page *page) +{ + if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { + activate_page(page); + ClearPageReferenced(page); + } else if (!PageReferenced(page)) { + SetPageReferenced(page); + } +} + +EXPORT_SYMBOL(mark_page_accessed); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; + +void fastcall lru_cache_add(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add(pvec); + put_cpu_var(lru_add_pvecs); +} + +void fastcall lru_cache_add_active(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add_active(pvec); + put_cpu_var(lru_add_active_pvecs); +} + +void lru_add_drain(void) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); + pvec = &__get_cpu_var(lru_add_active_pvecs); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); + put_cpu_var(lru_add_pvecs); +} + +/* + * This path almost never happens for VM activity - pages are normally + * freed via pagevecs. But it gets used by networking. + */ +void fastcall __page_cache_release(struct page *page) +{ + unsigned long flags; + struct zone *zone = page_zone(page); + + spin_lock_irqsave(&zone->lru_lock, flags); + if (TestClearPageLRU(page)) + del_page_from_lru(zone, page); + if (page_count(page) != 0) + page = NULL; + spin_unlock_irqrestore(&zone->lru_lock, flags); + if (page) + free_hot_page(page); +} + +EXPORT_SYMBOL(__page_cache_release); + +/* + * Batched page_cache_release(). Decrement the reference count on all the + * passed pages. If it fell to zero then remove the page from the LRU and + * free it. + * + * Avoid taking zone->lru_lock if possible, but if it is taken, retain it + * for the remainder of the operation. + * + * The locking in this function is against shrink_cache(): we recheck the + * page count inside the lock to see whether shrink_cache grabbed the page + * via the LRU. If it did, give up: shrink_cache will free it. + */ +void release_pages(struct page **pages, int nr, int cold) +{ + int i; + struct pagevec pages_to_free; + struct zone *zone = NULL; + + pagevec_init(&pages_to_free, cold); + for (i = 0; i < nr; i++) { + struct page *page = pages[i]; + struct zone *pagezone; + + if (PageReserved(page) || !put_page_testzero(page)) + continue; + + pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + if (TestClearPageLRU(page)) + del_page_from_lru(zone, page); + if (page_count(page) == 0) { + if (!pagevec_add(&pages_to_free, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); + zone = NULL; /* No lock is held */ + } + } + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + + pagevec_free(&pages_to_free); +} + +/* + * The pages which we're about to release may be in the deferred lru-addition + * queues. That would prevent them from really being freed right now. That's + * OK from a correctness point of view but is inefficient - those pages may be + * cache-warm and we want to give them back to the page allocator ASAP. + * + * So __pagevec_release() will drain those queues here. __pagevec_lru_add() + * and __pagevec_lru_add_active() call release_pages() directly to avoid + * mutual recursion. + */ +void __pagevec_release(struct pagevec *pvec) +{ + lru_add_drain(); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + pagevec_reinit(pvec); +} + +/* + * pagevec_release() for pages which are known to not be on the LRU + * + * This function reinitialises the caller's pagevec. + */ +void __pagevec_release_nonlru(struct pagevec *pvec) +{ + int i; + struct pagevec pages_to_free; + + pagevec_init(&pages_to_free, pvec->cold); + pages_to_free.cold = pvec->cold; + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + BUG_ON(PageLRU(page)); + if (put_page_testzero(page)) + pagevec_add(&pages_to_free, page); + } + pagevec_free(&pages_to_free); + pagevec_reinit(pvec); +} + +/* + * Add the passed pages to the LRU, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. + */ +void __pagevec_lru_add(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + if (TestSetPageLRU(page)) + BUG(); + add_page_to_inactive_list(zone, page); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +EXPORT_SYMBOL(__pagevec_lru_add); + +void __pagevec_lru_add_active(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + if (TestSetPageLRU(page)) + BUG(); + if (TestSetPageActive(page)) + BUG(); + add_page_to_active_list(zone, page); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +/* + * Try to drop buffers from the pages in a pagevec + */ +void pagevec_strip(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (PagePrivate(page) && !TestSetPageLocked(page)) { + try_to_release_page(page, 0); + unlock_page(page); + } + } +} + +/** + * pagevec_lookup - gang pagecache lookup + * @pvec: Where the resulting pages are placed + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * + * pagevec_lookup() will search for and return a group of up to @nr_pages pages + * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * reference against the pages in @pvec. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * pagevec_lookup() returns the number of pages which were found. + */ +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages) +{ + pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + return pagevec_count(pvec); +} + +unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, + pgoff_t *index, int tag, unsigned nr_pages) +{ + pvec->nr = find_get_pages_tag(mapping, index, tag, + nr_pages, pvec->pages); + return pagevec_count(pvec); +} + + +#ifdef CONFIG_SMP +/* + * We tolerate a little inaccuracy to avoid ping-ponging the counter between + * CPUs + */ +#define ACCT_THRESHOLD max(16, NR_CPUS * 2) + +static DEFINE_PER_CPU(long, committed_space) = 0; + +void vm_acct_memory(long pages) +{ + long *local; + + preempt_disable(); + local = &__get_cpu_var(committed_space); + *local += pages; + if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { + atomic_add(*local, &vm_committed_space); + *local = 0; + } + preempt_enable(); +} +EXPORT_SYMBOL(vm_acct_memory); + +#ifdef CONFIG_HOTPLUG_CPU +static void lru_drain_cache(unsigned int cpu) +{ + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); + + /* CPU is dead, so no locking needed. */ + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); +} + +/* Drop the CPU's cached committed space back into the central pool. */ +static int cpu_swap_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long *committed; + + committed = &per_cpu(committed_space, (long)hcpu); + if (action == CPU_DEAD) { + atomic_add(*committed, &vm_committed_space); + *committed = 0; + lru_drain_cache((long)hcpu); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +void percpu_counter_mod(struct percpu_counter *fbc, long amount) +{ + long count; + long *pcount; + int cpu = get_cpu(); + + pcount = per_cpu_ptr(fbc->counters, cpu); + count = *pcount + amount; + if (count >= FBC_BATCH || count <= -FBC_BATCH) { + spin_lock(&fbc->lock); + fbc->count += count; + spin_unlock(&fbc->lock); + count = 0; + } + *pcount = count; + put_cpu(); +} +EXPORT_SYMBOL(percpu_counter_mod); +#endif + +/* + * Perform any setup for the swap system + */ +void __init swap_setup(void) +{ + unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ + if (megs < 16) + page_cluster = 2; + else + page_cluster = 3; + /* + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ + hotcpu_notifier(cpu_swap_callback, 0); +} diff --git a/mm/swap_state.c b/mm/swap_state.c new file mode 100644 index 000000000000..a063a902ed03 --- /dev/null +++ b/mm/swap_state.c @@ -0,0 +1,382 @@ +/* + * linux/mm/swap_state.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_list, to make sync_page look nicer, and to allow + * future use of radix_tree tags in the swap cache. + */ +static struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .sync_page = block_sync_page, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +static struct backing_dev_info swap_backing_dev_info = { + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .unplug_io_fn = swap_unplug_io_fn, +}; + +struct address_space swapper_space = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .tree_lock = RW_LOCK_UNLOCKED, + .a_ops = &swap_aops, + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), + .backing_dev_info = &swap_backing_dev_info, +}; +EXPORT_SYMBOL(swapper_space); + +#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) + +static struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; +} swap_cache_info; + +void show_swap_cache_info(void) +{ + printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", + swap_cache_info.add_total, swap_cache_info.del_total, + swap_cache_info.find_success, swap_cache_info.find_total, + swap_cache_info.noent_race, swap_cache_info.exist_race); + printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); +} + +/* + * __add_to_swap_cache resembles add_to_page_cache on swapper_space, + * but sets SwapCache flag and private instead of mapping and index. + */ +static int __add_to_swap_cache(struct page *page, + swp_entry_t entry, int gfp_mask) +{ + int error; + + BUG_ON(PageSwapCache(page)); + BUG_ON(PagePrivate(page)); + error = radix_tree_preload(gfp_mask); + if (!error) { + write_lock_irq(&swapper_space.tree_lock); + error = radix_tree_insert(&swapper_space.page_tree, + entry.val, page); + if (!error) { + page_cache_get(page); + SetPageLocked(page); + SetPageSwapCache(page); + page->private = entry.val; + total_swapcache_pages++; + pagecache_acct(1); + } + write_unlock_irq(&swapper_space.tree_lock); + radix_tree_preload_end(); + } + return error; +} + +static int add_to_swap_cache(struct page *page, swp_entry_t entry) +{ + int error; + + if (!swap_duplicate(entry)) { + INC_CACHE_INFO(noent_race); + return -ENOENT; + } + error = __add_to_swap_cache(page, entry, GFP_KERNEL); + /* + * Anon pages are already on the LRU, we don't run lru_cache_add here. + */ + if (error) { + swap_free(entry); + if (error == -EEXIST) + INC_CACHE_INFO(exist_race); + return error; + } + INC_CACHE_INFO(add_total); + return 0; +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache. + */ +void __delete_from_swap_cache(struct page *page) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(!PageSwapCache(page)); + BUG_ON(PageWriteback(page)); + + radix_tree_delete(&swapper_space.page_tree, page->private); + page->private = 0; + ClearPageSwapCache(page); + total_swapcache_pages--; + pagecache_acct(-1); + INC_CACHE_INFO(del_total); +} + +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + int pf_flags; + int err; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + + /* Radix-tree node allocations are performing + * GFP_ATOMIC allocations under PF_MEMALLOC. + * They can completely exhaust the page allocator. + * + * So PF_MEMALLOC is dropped here. This causes the slab + * allocations to fail earlier, so radix-tree nodes will + * then be allocated from the mempool reserves. + * + * We're still using __GFP_HIGH for radix-tree node + * allocations, so some of the emergency pools are available, + * just not all of them. + */ + + pf_flags = current->flags; + current->flags &= ~PF_MEMALLOC; + + /* + * Add it to the swap cache and mark it dirty + */ + err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN); + + if (pf_flags & PF_MEMALLOC) + current->flags |= PF_MEMALLOC; + + switch (err) { + case 0: /* Success */ + SetPageUptodate(page); + SetPageDirty(page); + INC_CACHE_INFO(add_total); + return 1; + case -EEXIST: + /* Raced with "speculative" read_swap_cache_async */ + INC_CACHE_INFO(exist_race); + swap_free(entry); + continue; + default: + /* -ENOMEM radix-tree allocation failure */ + swap_free(entry); + return 0; + } + } +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache and locked. + * It will never put the page into the free list, + * the caller has a reference on the page. + */ +void delete_from_swap_cache(struct page *page) +{ + swp_entry_t entry; + + BUG_ON(!PageSwapCache(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(PagePrivate(page)); + + entry.val = page->private; + + write_lock_irq(&swapper_space.tree_lock); + __delete_from_swap_cache(page); + write_unlock_irq(&swapper_space.tree_lock); + + swap_free(entry); + page_cache_release(page); +} + +/* + * Strange swizzling function only for use by shmem_writepage + */ +int move_to_swap_cache(struct page *page, swp_entry_t entry) +{ + int err = __add_to_swap_cache(page, entry, GFP_ATOMIC); + if (!err) { + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ + if (!swap_duplicate(entry)) + BUG(); + SetPageDirty(page); + INC_CACHE_INFO(add_total); + } else if (err == -EEXIST) + INC_CACHE_INFO(exist_race); + return err; +} + +/* + * Strange swizzling function for shmem_getpage (and shmem_unuse) + */ +int move_from_swap_cache(struct page *page, unsigned long index, + struct address_space *mapping) +{ + int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); + if (!err) { + delete_from_swap_cache(page); + /* shift page from clean_pages to dirty_pages list */ + ClearPageDirty(page); + set_page_dirty(page); + } + return err; +} + +/* + * If we are the only user, then try to free up the swap cache. + * + * Its ok to check for PageSwapCache without the page lock + * here because we are going to recheck again inside + * exclusive_swap_page() _with_ the lock. + * - Marcelo + */ +static inline void free_swap_cache(struct page *page) +{ + if (PageSwapCache(page) && !TestSetPageLocked(page)) { + remove_exclusive_swap_page(page); + unlock_page(page); + } +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. Can not do a lock_page, + * as we are holding the page_table_lock spinlock. + */ +void free_page_and_swap_cache(struct page *page) +{ + free_swap_cache(page); + page_cache_release(page); +} + +/* + * Passed an array of pages, drop them all from swapcache and then release + * them. They are removed from the LRU and freed if this is their last use. + */ +void free_pages_and_swap_cache(struct page **pages, int nr) +{ + int chunk = 16; + struct page **pagep = pages; + + lru_add_drain(); + while (nr) { + int todo = min(chunk, nr); + int i; + + for (i = 0; i < todo; i++) + free_swap_cache(pagep[i]); + release_pages(pagep, todo, 0); + pagep += todo; + nr -= todo; + } +} + +/* + * Lookup a swap entry in the swap cache. A found page will be returned + * unlocked and with its refcount incremented - we rely on the kernel + * lock getting page table operations atomic even if we drop the page + * lock before returning. + */ +struct page * lookup_swap_cache(swp_entry_t entry) +{ + struct page *page; + + page = find_get_page(&swapper_space, entry.val); + + if (page) + INC_CACHE_INFO(find_success); + + INC_CACHE_INFO(find_total); + return page; +} + +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. + * A failure return means that either the page allocation failed or that + * the swap entry is no longer in use. + */ +struct page *read_swap_cache_async(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + struct page *found_page, *new_page = NULL; + int err; + + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics. + */ + found_page = find_get_page(&swapper_space, entry.val); + if (found_page) + break; + + /* + * Get a new page to read into from swap. + */ + if (!new_page) { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (!new_page) + break; /* Out of memory */ + } + + /* + * Associate the page with swap entry in the swap cache. + * May fail (-ENOENT) if swap entry has been freed since + * our caller observed it. May fail (-EEXIST) if there + * is already a page associated with this entry in the + * swap cache: added by a racing read_swap_cache_async, + * or by try_to_swap_out (or shmem_writepage) re-using + * the just freed swap entry for an existing page. + * May fail (-ENOMEM) if radix-tree node allocation failed. + */ + err = add_to_swap_cache(new_page, entry); + if (!err) { + /* + * Initiate read into locked page and return. + */ + lru_cache_add_active(new_page); + swap_readpage(NULL, new_page); + return new_page; + } + } while (err != -ENOENT && err != -ENOMEM); + + if (new_page) + page_cache_release(new_page); + return found_page; +} diff --git a/mm/swapfile.c b/mm/swapfile.c new file mode 100644 index 000000000000..a60e0075d55b --- /dev/null +++ b/mm/swapfile.c @@ -0,0 +1,1672 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +DEFINE_SPINLOCK(swaplock); +unsigned int nr_swapfiles; +long total_swap_pages; +static int swap_overflow; + +EXPORT_SYMBOL(total_swap_pages); + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +struct swap_list_t swap_list = {-1, -1}; + +struct swap_info_struct swap_info[MAX_SWAPFILES]; + +static DECLARE_MUTEX(swapon_sem); + +/* + * We need this because the bdev->unplug_fn can sleep and we cannot + * hold swap_list_lock while calling the unplug_fn. And swap_list_lock + * cannot be turned into a semaphore. + */ +static DECLARE_RWSEM(swap_unplug_sem); + +#define SWAPFILE_CLUSTER 256 + +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) +{ + swp_entry_t entry; + + down_read(&swap_unplug_sem); + entry.val = page->private; + if (PageSwapCache(page)) { + struct block_device *bdev = swap_info[swp_type(entry)].bdev; + struct backing_dev_info *bdi; + + /* + * If the page is removed from swapcache from under us (with a + * racy try_to_unuse/swapoff) we need an additional reference + * count to avoid reading garbage from page->private above. If + * the WARN_ON triggers during a swapoff it maybe the race + * condition and it's harmless. However if it triggers without + * swapoff it signals a problem. + */ + WARN_ON(page_count(page) <= 1); + + bdi = bdev->bd_inode->i_mapping->backing_dev_info; + bdi->unplug_io_fn(bdi, page); + } + up_read(&swap_unplug_sem); +} + +static inline int scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset; + /* + * We try to cluster swap pages by allocating them + * sequentially in swap. Once we've allocated + * SWAPFILE_CLUSTER pages this way, however, we resort to + * first-free allocation, starting a new cluster. This + * prevents us from scattering swap pages all over the entire + * swap partition, so that we reduce overall disk seek times + * between swap pages. -- sct */ + if (si->cluster_nr) { + while (si->cluster_next <= si->highest_bit) { + offset = si->cluster_next++; + if (si->swap_map[offset]) + continue; + si->cluster_nr--; + goto got_page; + } + } + si->cluster_nr = SWAPFILE_CLUSTER; + + /* try to find an empty (even not aligned) cluster. */ + offset = si->lowest_bit; + check_next_cluster: + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) + { + unsigned long nr; + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) + if (si->swap_map[nr]) + { + offset = nr+1; + goto check_next_cluster; + } + /* We found a completly empty cluster, so start + * using it. + */ + goto got_page; + } + /* No luck, so now go finegrined as usual. -Andrea */ + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + si->lowest_bit = offset+1; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + if (si->lowest_bit > si->highest_bit) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + si->inuse_pages++; + nr_swap_pages--; + si->cluster_next = offset+1; + return offset; + } + si->lowest_bit = si->max; + si->highest_bit = 0; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset; + swp_entry_t entry; + int type, wrapped = 0; + + entry.val = 0; /* Out of memory */ + swap_list_lock(); + type = swap_list.next; + if (type < 0) + goto out; + if (nr_swap_pages <= 0) + goto out; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { + swap_device_lock(p); + offset = scan_swap_map(p); + swap_device_unlock(p); + if (offset) { + entry = swp_entry(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) { + swap_list.next = swap_list.head; + } else { + swap_list.next = type; + } + goto out; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else + if (type < 0) + goto out; /* out of swap space */ + } +out: + swap_list_unlock(); + return entry; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = swp_offset(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static void swap_info_put(struct swap_info_struct * p) +{ + swap_device_unlock(p); + swap_list_unlock(); +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + p->inuse_pages--; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, swp_offset(entry)); + swap_info_put(p); + } +} + +/* + * Check if we're the only user of a swap page, + * when the page is locked. + */ +static int exclusive_swap_page(struct page *page) +{ + int retval = 0; + struct swap_info_struct * p; + swp_entry_t entry; + + entry.val = page->private; + p = swap_info_get(entry); + if (p) { + /* Is the only swap cache user the cache itself? */ + if (p->swap_map[swp_offset(entry)] == 1) { + /* Recheck the page count with the swapcache lock held.. */ + write_lock_irq(&swapper_space.tree_lock); + if (page_count(page) == 2) + retval = 1; + write_unlock_irq(&swapper_space.tree_lock); + } + swap_info_put(p); + } + return retval; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + * + * Here "exclusive_swap_page()" does the real + * work, but we opportunistically check whether + * we need to get all the locks first.. + */ +int can_share_swap_page(struct page *page) +{ + int retval = 0; + + if (!PageLocked(page)) + BUG(); + switch (page_count(page)) { + case 3: + if (!PagePrivate(page)) + break; + /* Fallthrough */ + case 2: + if (!PageSwapCache(page)) + break; + retval = exclusive_swap_page(page); + break; + case 1: + if (PageReserved(page)) + break; + retval = 1; + } + return retval; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +int remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + BUG_ON(PagePrivate(page)); + BUG_ON(!PageLocked(page)); + + if (!PageSwapCache(page)) + return 0; + if (PageWriteback(page)) + return 0; + if (page_count(page) != 2) /* 2: us + cache */ + return 0; + + entry.val = page->private; + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[swp_offset(entry)] == 1) { + /* Recheck the page count with the swapcache lock held.. */ + write_lock_irq(&swapper_space.tree_lock); + if ((page_count(page) == 2) && !PageWriteback(page)) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + write_unlock_irq(&swapper_space.tree_lock); + } + swap_info_put(p); + + if (retval) { + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, swp_offset(entry)) == 1) + page = find_trylock_page(&swapper_space, entry.val); + swap_info_put(p); + } + if (page) { + int one_user; + + BUG_ON(PagePrivate(page)); + page_cache_get(page); + one_user = (page_count(page) == 2); + /* Only cache user (+us), or swap space full? Free it! */ + if (!PageWriteback(page) && (one_user || vm_swap_full())) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + unlock_page(page); + page_cache_release(page); + } +} + +/* + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. + * + * vma->vm_mm->page_table_lock is held. + */ +static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, swp_entry_t entry, struct page *page) +{ + inc_mm_counter(vma->vm_mm, rss); + get_page(page); + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_anon_rmap(page, vma, addr); + swap_free(entry); + /* + * Move the page to the active list so it is not + * immediately swapped out again after swapon. + */ + activate_page(page); +} + +static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + pte_t *pte; + pte_t swp_pte = swp_entry_to_pte(entry); + + pte = pte_offset_map(pmd, addr); + do { + /* + * swapoff spends a _lot_ of time in this loop! + * Test inline before going to call unuse_pte. + */ + if (unlikely(pte_same(*pte, swp_pte))) { + unuse_pte(vma, pte, addr, entry, page); + pte_unmap(pte); + return 1; + } + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; +} + +static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + if (unuse_pte_range(vma, pmd, addr, next, entry, page)) + return 1; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + if (unuse_pmd_range(vma, pud, addr, next, entry, page)) + return 1; + } while (pud++, addr = next, addr != end); + return 0; +} + +static int unuse_vma(struct vm_area_struct *vma, + swp_entry_t entry, struct page *page) +{ + pgd_t *pgd; + unsigned long addr, end, next; + + if (page->mapping) { + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + return 0; + else + end = addr + PAGE_SIZE; + } else { + addr = vma->vm_start; + end = vma->vm_end; + } + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + if (unuse_pud_range(vma, pgd, addr, next, entry, page)) + return 1; + } while (pgd++, addr = next, addr != end); + return 0; +} + +static int unuse_mm(struct mm_struct *mm, + swp_entry_t entry, struct page *page) +{ + struct vm_area_struct *vma; + + if (!down_read_trylock(&mm->mmap_sem)) { + /* + * Our reference to the page stops try_to_unmap_one from + * unmapping its ptes, so swapoff can make progress. + */ + unlock_page(page); + down_read(&mm->mmap_sem); + lock_page(page); + } + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma && unuse_vma(vma, entry, page)) + break; + } + spin_unlock(&mm->page_table_lock); + up_read(&mm->mmap_sem); + /* + * Currently unuse_mm cannot fail, but leave error handling + * at call sites for now, since we change it from time to time. + */ + return 0; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static int find_next_to_unuse(struct swap_info_struct *si, int prev) +{ + int max = si->max; + int i = prev; + int count; + + /* + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). + */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. + */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; + int shmem; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering, which clusters forked mms + * together, child after parent. If we race with dup_mmap(), we + * prefer to resolve parent before child, lest we miss entries + * duplicated after we scanned child: using last mm would invert + * that. Though it's only a serious concern when an overflowed + * swap count is reset from SWAP_MAP_MAX, preventing a rescan. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * there are races when an instance of an entry might be missed. + */ + while ((i = find_next_to_unuse(si, i)) != 0) { + if (signal_pending(current)) { + retval = -EINTR; + break; + } + + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = swp_entry(type, i); + page = read_swap_cache_async(entry, NULL, 0); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page_locked" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page_locked(page); + wait_on_page_writeback(page); + lock_page(page); + wait_on_page_writeback(page); + + /* + * Remove all references to entry. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. + */ + shmem = 0; + swcount = *swap_map; + if (swcount > 1) { + if (start_mm == &init_mm) + shmem = shmem_unuse(entry, page); + else + retval = unuse_mm(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *prev_mm = start_mm; + struct mm_struct *mm; + + atomic_inc(&new_start_mm->mm_users); + atomic_inc(&prev_mm->mm_users); + spin_lock(&mmlist_lock); + while (*swap_map > 1 && !retval && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + if (atomic_inc_return(&mm->mm_users) == 1) { + atomic_dec(&mm->mm_users); + continue; + } + spin_unlock(&mmlist_lock); + mmput(prev_mm); + prev_mm = mm; + + cond_resched(); + + swcount = *swap_map; + if (swcount <= 1) + ; + else if (mm == &init_mm) { + set_start_mm = 1; + shmem = shmem_unuse(entry, page); + } else + retval = unuse_mm(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + mmput(new_start_mm); + atomic_inc(&mm->mm_users); + new_start_mm = mm; + set_start_mm = 0; + } + spin_lock(&mmlist_lock); + } + spin_unlock(&mmlist_lock); + mmput(prev_mm); + mmput(start_mm); + start_mm = new_start_mm; + } + if (retval) { + unlock_page(page); + page_cache_release(page); + break; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_device_lock(si); + *swap_map = 1; + swap_device_unlock(si); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_unmap could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * + * Note shmem_unuse already deleted a swappage from + * the swap cache, unless the move to filepage failed: + * in which case it left swappage in cache, lowered its + * swap count to pass quickly through the loops above, + * and now we must reincrement count to try again later. + */ + if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + swap_writepage(page, &wbc); + lock_page(page); + wait_on_page_writeback(page); + } + if (PageSwapCache(page)) { + if (shmem) + swap_duplicate(entry); + else + delete_from_swap_cache(page); + } + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so shrink_list will preserve it. + */ + SetPageDirty(page); + unlock_page(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. + */ + cond_resched(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +/* + * After a successful try_to_unuse, if no swap is now in use, we know we + * can empty the mmlist. swap_list_lock must be held on entry and exit. + * Note that mmlist_lock nests inside swap_list_lock, and an mm must be + * added to the mmlist just after page_duplicate - before would be racy. + */ +static void drain_mmlist(void) +{ + struct list_head *p, *next; + unsigned int i; + + for (i = 0; i < nr_swapfiles; i++) + if (swap_info[i].inuse_pages) + return; + spin_lock(&mmlist_lock); + list_for_each_safe(p, next, &init_mm.mmlist) + list_del_init(p); + spin_unlock(&mmlist_lock); +} + +/* + * Use this swapdev's extent info to locate the (PAGE_SIZE) block which + * corresponds to page offset `offset'. + */ +sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +{ + struct swap_extent *se = sis->curr_swap_extent; + struct swap_extent *start_se = se; + + for ( ; ; ) { + struct list_head *lh; + + if (se->start_page <= offset && + offset < (se->start_page + se->nr_pages)) { + return se->start_block + (offset - se->start_page); + } + lh = se->list.prev; + if (lh == &sis->extent_list) + lh = lh->prev; + se = list_entry(lh, struct swap_extent, list); + sis->curr_swap_extent = se; + BUG_ON(se == start_se); /* It *must* be present */ + } +} + +/* + * Free all of a swapdev's extent information + */ +static void destroy_swap_extents(struct swap_info_struct *sis) +{ + while (!list_empty(&sis->extent_list)) { + struct swap_extent *se; + + se = list_entry(sis->extent_list.next, + struct swap_extent, list); + list_del(&se->list); + kfree(se); + } + sis->nr_extents = 0; +} + +/* + * Add a block range (and the corresponding page range) into this swapdev's + * extent list. The extent list is kept sorted in block order. + * + * This function rather assumes that it is called in ascending sector_t order. + * It doesn't look for extent coalescing opportunities. + */ +static int +add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, + unsigned long nr_pages, sector_t start_block) +{ + struct swap_extent *se; + struct swap_extent *new_se; + struct list_head *lh; + + lh = sis->extent_list.next; /* The highest-addressed block */ + while (lh != &sis->extent_list) { + se = list_entry(lh, struct swap_extent, list); + if (se->start_block + se->nr_pages == start_block && + se->start_page + se->nr_pages == start_page) { + /* Merge it */ + se->nr_pages += nr_pages; + return 0; + } + lh = lh->next; + } + + /* + * No merge. Insert a new extent, preserving ordering. + */ + new_se = kmalloc(sizeof(*se), GFP_KERNEL); + if (new_se == NULL) + return -ENOMEM; + new_se->start_page = start_page; + new_se->nr_pages = nr_pages; + new_se->start_block = start_block; + + lh = sis->extent_list.prev; /* The lowest block */ + while (lh != &sis->extent_list) { + se = list_entry(lh, struct swap_extent, list); + if (se->start_block > start_block) + break; + lh = lh->prev; + } + list_add_tail(&new_se->list, lh); + sis->nr_extents++; + return 0; +} + +/* + * A `swap extent' is a simple thing which maps a contiguous range of pages + * onto a contiguous range of disk blocks. An ordered list of swap extents + * is built at swapon time and is then used at swap_writepage/swap_readpage + * time for locating where on disk a page belongs. + * + * If the swapfile is an S_ISBLK block device, a single extent is installed. + * This is done so that the main operating code can treat S_ISBLK and S_ISREG + * swap files identically. + * + * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap + * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK + * swapfiles are handled *identically* after swapon time. + * + * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks + * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If + * some stray blocks are found which do not fall within the PAGE_SIZE alignment + * requirements, they are simply tossed out - we will never use those blocks + * for swapping. + * + * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This + * prevents root from shooting her foot off by ftruncating an in-use swapfile, + * which will scribble on the fs. + * + * The amount of disk space which a single swap extent represents varies. + * Typically it is in the 1-4 megabyte range. So we can have hundreds of + * extents in the list. To avoid much list walking, we cache the previous + * search location in `curr_swap_extent', and start new searches from there. + * This is extremely effective. The average number of iterations in + * map_swap_page() has been measured at about 0.3 per page. - akpm. + */ +static int setup_swap_extents(struct swap_info_struct *sis) +{ + struct inode *inode; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + int ret; + + inode = sis->swap_file->f_mapping->host; + if (S_ISBLK(inode->i_mode)) { + ret = add_swap_extent(sis, 0, sis->max, 0); + goto done; + } + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. + */ + probe_block = 0; + page_no = 0; + last_block = i_size_read(inode) >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + first_block = bmap(inode, probe_block); + if (first_block == 0) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = bmap(inode, probe_block + block_in_page); + if (block == 0) + goto bad_bmap; + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, + first_block >> (PAGE_SHIFT - blkbits)); + if (ret) + goto out; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = 0; + if (page_no == 0) + ret = -EINVAL; + sis->max = page_no; + sis->highest_bit = page_no - 1; +done: + sis->curr_swap_extent = list_entry(sis->extent_list.prev, + struct swap_extent, list); + goto out; +bad_bmap: + printk(KERN_ERR "swapon: swapfile has holes\n"); + ret = -EINVAL; +out: + return ret; +} + +#if 0 /* We don't need this yet */ +#include +int page_queue_congested(struct page *page) +{ + struct backing_dev_info *bdi; + + BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ + + if (PageSwapCache(page)) { + swp_entry_t entry = { .val = page->private }; + struct swap_info_struct *sis; + + sis = get_swap_info_struct(swp_type(entry)); + bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; + } else + bdi = page->mapping->backing_dev_info; + return bdi_write_congested(bdi); +} +#endif + +asmlinkage long sys_swapoff(const char __user * specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct file *swap_file, *victim; + struct address_space *mapping; + struct inode *inode; + char * pathname; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + pathname = getname(specialfile); + err = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + + victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); + putname(pathname); + err = PTR_ERR(victim); + if (IS_ERR(victim)) + goto out; + + mapping = victim->f_mapping; + prev = -1; + swap_list_lock(); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { + if (p->swap_file->f_mapping == mapping) + break; + } + prev = type; + } + if (type < 0) { + err = -EINVAL; + swap_list_unlock(); + goto out_dput; + } + if (!security_vm_enough_memory(p->pages)) + vm_unacct_memory(p->pages); + else { + err = -ENOMEM; + swap_list_unlock(); + goto out_dput; + } + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; + swap_list_unlock(); + current->flags |= PF_SWAPOFF; + err = try_to_unuse(type); + current->flags &= ~PF_SWAPOFF; + + /* wait for any unplug function to finish */ + down_write(&swap_unplug_sem); + up_write(&swap_unplug_sem); + + if (err) { + /* re-insert swap space back into swap_list */ + swap_list_lock(); + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags |= SWP_WRITEOK; + swap_list_unlock(); + goto out_dput; + } + down(&swapon_sem); + swap_list_lock(); + drain_mmlist(); + swap_device_lock(p); + swap_file = p->swap_file; + p->swap_file = NULL; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + destroy_swap_extents(p); + swap_device_unlock(p); + swap_list_unlock(); + up(&swapon_sem); + vfree(swap_map); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); + set_blocksize(bdev, p->old_block_size); + bd_release(bdev); + } else { + down(&inode->i_sem); + inode->i_flags &= ~S_SWAPFILE; + up(&inode->i_sem); + } + filp_close(swap_file, NULL); + err = 0; + +out_dput: + filp_close(victim, NULL); +out: + return err; +} + +#ifdef CONFIG_PROC_FS +/* iterator */ +static void *swap_start(struct seq_file *swap, loff_t *pos) +{ + struct swap_info_struct *ptr = swap_info; + int i; + loff_t l = *pos; + + down(&swapon_sem); + + for (i = 0; i < nr_swapfiles; i++, ptr++) { + if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + continue; + if (!l--) + return ptr; + } + + return NULL; +} + +static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) +{ + struct swap_info_struct *ptr = v; + struct swap_info_struct *endptr = swap_info + nr_swapfiles; + + for (++ptr; ptr < endptr; ptr++) { + if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + continue; + ++*pos; + return ptr; + } + + return NULL; +} + +static void swap_stop(struct seq_file *swap, void *v) +{ + up(&swapon_sem); +} + +static int swap_show(struct seq_file *swap, void *v) +{ + struct swap_info_struct *ptr = v; + struct file *file; + int len; + + if (v == swap_info) + seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + + file = ptr->swap_file; + len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); + seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n", + len < 40 ? 40 - len : 1, " ", + S_ISBLK(file->f_dentry->d_inode->i_mode) ? + "partition" : "file\t", + ptr->pages << (PAGE_SHIFT - 10), + ptr->inuse_pages << (PAGE_SHIFT - 10), + ptr->prio); + return 0; +} + +static struct seq_operations swaps_op = { + .start = swap_start, + .next = swap_next, + .stop = swap_stop, + .show = swap_show +}; + +static int swaps_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &swaps_op); +} + +static struct file_operations proc_swaps_operations = { + .open = swaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init procswaps_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("swaps", 0, NULL); + if (entry) + entry->proc_fops = &proc_swaps_operations; + return 0; +} +__initcall(procswaps_init); +#endif /* CONFIG_PROC_FS */ + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. + * + * The swapon system call + */ +asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + char *name = NULL; + struct block_device *bdev = NULL; + struct file *swap_file = NULL; + struct address_space *mapping; + unsigned int type; + int i, prev; + int error; + static int least_priority; + union swap_header *swap_header = NULL; + int swap_header_version; + int nr_good_pages = 0; + unsigned long maxpages = 1; + int swapfilesize; + unsigned short *swap_map; + struct page *page = NULL; + struct inode *inode = NULL; + int did_down = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + swap_list_lock(); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + /* + * Test if adding another swap device is possible. There are + * two limiting factors: 1) the number of bits for the swap + * type swp_entry_t definition and 2) the number of bits for + * the swap type in the swap ptes as defined by the different + * architectures. To honor both limitations a swap entry + * with swap offset 0 and swap type ~0UL is created, encoded + * to a swap pte, decoded to a swp_entry_t again and finally + * the swap type part is extracted. This will mask all bits + * from the initial ~0UL that can't be encoded in either the + * swp_entry_t or the architecture definition of a swap pte. + */ + if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { + swap_list_unlock(); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + INIT_LIST_HEAD(&p->extent_list); + p->flags = SWP_USED; + p->nr_extents = 0; + p->swap_file = NULL; + p->old_block_size = 0; + p->swap_map = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->inuse_pages = 0; + spin_lock_init(&p->sdev_lock); + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + swap_list_unlock(); + name = getname(specialfile); + error = PTR_ERR(name); + if (IS_ERR(name)) { + name = NULL; + goto bad_swap_2; + } + swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); + error = PTR_ERR(swap_file); + if (IS_ERR(swap_file)) { + swap_file = NULL; + goto bad_swap_2; + } + + p->swap_file = swap_file; + mapping = swap_file->f_mapping; + inode = mapping->host; + + error = -EBUSY; + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *q = &swap_info[i]; + + if (i == type || !q->swap_file) + continue; + if (mapping == q->swap_file->f_mapping) + goto bad_swap; + } + + error = -EINVAL; + if (S_ISBLK(inode->i_mode)) { + bdev = I_BDEV(inode); + error = bd_claim(bdev, sys_swapon); + if (error < 0) { + bdev = NULL; + goto bad_swap; + } + p->old_block_size = block_size(bdev); + error = set_blocksize(bdev, PAGE_SIZE); + if (error < 0) + goto bad_swap; + p->bdev = bdev; + } else if (S_ISREG(inode->i_mode)) { + p->bdev = inode->i_sb->s_bdev; + down(&inode->i_sem); + did_down = 1; + if (IS_SWAPFILE(inode)) { + error = -EBUSY; + goto bad_swap; + } + } else { + goto bad_swap; + } + + swapfilesize = i_size_read(inode) >> PAGE_SHIFT; + + /* + * Read the swap header. + */ + if (!mapping->a_ops->readpage) { + error = -EINVAL; + goto bad_swap; + } + page = read_cache_page(mapping, 0, + (filler_t *)mapping->a_ops->readpage, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto bad_swap; + } + wait_on_page_locked(page); + if (!PageUptodate(page)) + goto bad_swap; + kmap(page); + swap_header = page_address(page); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + printk(KERN_ERR "version 0 swap is no longer supported. " + "Use mkswap -v1 %s\n", name); + error = -EINVAL; + goto bad_swap; + case 2: + /* Check the swap header's sub-version and the size of + the swap file and bad block lists */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); + error = -EINVAL; + goto bad_swap; + } + + p->lowest_bit = 1; + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in the a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; + + error = -EINVAL; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + + /* OK, set up the swap map and apply the bad block list */ + if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + error = -ENOMEM; + goto bad_swap; + } + + error = 0; + memset(p->swap_map, 0, maxpages * sizeof(short)); + for (i=0; iinfo.nr_badpages; i++) { + int page = swap_header->info.badpages[i]; + if (page <= 0 || page >= swap_header->info.last_page) + error = -EINVAL; + else + p->swap_map[page] = SWAP_MAP_BAD; + } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; + if (error) + goto bad_swap; + } + + if (swapfilesize && maxpages > swapfilesize) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + error = -EINVAL; + goto bad_swap; + } + if (!nr_good_pages) { + printk(KERN_WARNING "Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; + p->pages = nr_good_pages; + + error = setup_swap_extents(p); + if (error) + goto bad_swap; + + down(&swapon_sem); + swap_list_lock(); + swap_device_lock(p); + p->flags = SWP_ACTIVE; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; + printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n", + nr_good_pages<<(PAGE_SHIFT-10), name, + p->prio, p->nr_extents); + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + swap_device_unlock(p); + swap_list_unlock(); + up(&swapon_sem); + error = 0; + goto out; +bad_swap: + if (bdev) { + set_blocksize(bdev, p->old_block_size); + bd_release(bdev); + } +bad_swap_2: + swap_list_lock(); + swap_map = p->swap_map; + p->swap_file = NULL; + p->swap_map = NULL; + p->flags = 0; + if (!(swap_flags & SWAP_FLAG_PREFER)) + ++least_priority; + swap_list_unlock(); + destroy_swap_extents(p); + vfree(swap_map); + if (swap_file) + filp_close(swap_file, NULL); +out: + if (page && !IS_ERR(page)) { + kunmap(page); + page_cache_release(page); + } + if (name) + putname(name); + if (did_down) { + if (!error) + inode->i_flags |= S_SWAPFILE; + up(&inode->i_sem); + } + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i; + unsigned long nr_to_be_unused = 0; + + swap_list_lock(); + for (i = 0; i < nr_swapfiles; i++) { + if (!(swap_info[i].flags & SWP_USED) || + (swap_info[i].flags & SWP_WRITEOK)) + continue; + nr_to_be_unused += swap_info[i].inuse_pages; + } + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + swap_list_unlock(); +} + +/* + * Verify that a swap entry is valid and increment its swap map count. + * + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as + * "permanent", but will be reclaimed by the next swapoff. + */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = swp_offset(entry); + + swap_device_lock(p); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + swap_device_unlock(p); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +struct swap_info_struct * +get_swap_info_struct(unsigned type) +{ + return &swap_info[type]; +} + +/* + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. + */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + int ret = 0, i = 1 << page_cluster; + unsigned long toff; + struct swap_info_struct *swapdev = swp_type(entry) + swap_info; + + if (!page_cluster) /* no readahead */ + return 0; + toff = (swp_offset(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; + + swap_device_lock(swapdev); + do { + /* Don't read-ahead past the end of the swap area */ + if (toff >= swapdev->max) + break; + /* Don't read in free or bad pages */ + if (!swapdev->swap_map[toff]) + break; + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + break; + toff++; + ret++; + } while (--i); + swap_device_unlock(swapdev); + return ret; +} diff --git a/mm/thrash.c b/mm/thrash.c new file mode 100644 index 000000000000..11461f7ad830 --- /dev/null +++ b/mm/thrash.c @@ -0,0 +1,102 @@ +/* + * mm/thrash.c + * + * Copyright (C) 2004, Red Hat, Inc. + * Copyright (C) 2004, Rik van Riel + * Released under the GPL, see the file COPYING for details. + * + * Simple token based thrashing protection, using the algorithm + * described in: http://www.cs.wm.edu/~sjiang/token.pdf + */ +#include +#include +#include +#include + +static DEFINE_SPINLOCK(swap_token_lock); +static unsigned long swap_token_timeout; +static unsigned long swap_token_check; +struct mm_struct * swap_token_mm = &init_mm; + +#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) +#define SWAP_TOKEN_TIMEOUT 0 +/* + * Currently disabled; Needs further code to work at HZ * 300. + */ +unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT; + +/* + * Take the token away if the process had no page faults + * in the last interval, or if it has held the token for + * too long. + */ +#define SWAP_TOKEN_ENOUGH_RSS 1 +#define SWAP_TOKEN_TIMED_OUT 2 +static int should_release_swap_token(struct mm_struct *mm) +{ + int ret = 0; + if (!mm->recent_pagein) + ret = SWAP_TOKEN_ENOUGH_RSS; + else if (time_after(jiffies, swap_token_timeout)) + ret = SWAP_TOKEN_TIMED_OUT; + mm->recent_pagein = 0; + return ret; +} + +/* + * Try to grab the swapout protection token. We only try to + * grab it once every TOKEN_CHECK_INTERVAL, both to prevent + * SMP lock contention and to check that the process that held + * the token before is no longer thrashing. + */ +void grab_swap_token(void) +{ + struct mm_struct *mm; + int reason; + + /* We have the token. Let others know we still need it. */ + if (has_swap_token(current->mm)) { + current->mm->recent_pagein = 1; + return; + } + + if (time_after(jiffies, swap_token_check)) { + + /* Can't get swapout protection if we exceed our RSS limit. */ + // if (current->mm->rss > current->mm->rlimit_rss) + // return; + + /* ... or if we recently held the token. */ + if (time_before(jiffies, current->mm->swap_token_time)) + return; + + if (!spin_trylock(&swap_token_lock)) + return; + + swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + + mm = swap_token_mm; + if ((reason = should_release_swap_token(mm))) { + unsigned long eligible = jiffies; + if (reason == SWAP_TOKEN_TIMED_OUT) { + eligible += swap_token_default_timeout; + } + mm->swap_token_time = eligible; + swap_token_timeout = jiffies + swap_token_default_timeout; + swap_token_mm = current->mm; + } + spin_unlock(&swap_token_lock); + } + return; +} + +/* Called on process exit. */ +void __put_swap_token(struct mm_struct *mm) +{ + spin_lock(&swap_token_lock); + if (likely(mm == swap_token_mm)) { + swap_token_mm = &init_mm; + swap_token_check = jiffies; + } + spin_unlock(&swap_token_lock); +} diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c new file mode 100644 index 000000000000..c13a2161bca2 --- /dev/null +++ b/mm/tiny-shmem.c @@ -0,0 +1,122 @@ +/* + * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code + * + * Matt Mackall January, 2004 + * derived from mm/shmem.c and fs/ramfs/inode.c + * + * This is intended for small system where the benefits of the full + * shmem code (swap-backed and resource-limited) are outweighed by + * their complexity. On systems without swap this code should be + * effectively equivalent, but much lighter weight. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct file_system_type tmpfs_fs_type = { + .name = "tmpfs", + .get_sb = ramfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static struct vfsmount *shm_mnt; + +static int __init init_tmpfs(void) +{ + register_filesystem(&tmpfs_fs_type); +#ifdef CONFIG_TMPFS + devfs_mk_dir("shm"); +#endif + shm_mnt = kern_mount(&tmpfs_fs_type); + return 0; +} +module_init(init_tmpfs) + +/* + * shmem_file_setup - get an unlinked file living in tmpfs + * + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * + */ +struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +{ + int error; + struct file *file; + struct inode *inode; + struct dentry *dentry, *root; + struct qstr this; + + if (IS_ERR(shm_mnt)) + return (void *)shm_mnt; + + error = -ENOMEM; + this.name = name; + this.len = strlen(name); + this.hash = 0; /* will go */ + root = shm_mnt->mnt_root; + dentry = d_alloc(root, &this); + if (!dentry) + goto put_memory; + + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + + error = -ENOSPC; + inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + if (!inode) + goto close_file; + + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ + file->f_vfsmnt = mntget(shm_mnt); + file->f_dentry = dentry; + file->f_mapping = inode->i_mapping; + file->f_op = &ramfs_file_operations; + file->f_mode = FMODE_WRITE | FMODE_READ; + return file; + +close_file: + put_filp(file); +put_dentry: + dput(dentry); +put_memory: + return ERR_PTR(error); +} + +/* + * shmem_zero_setup - setup a shared anonymous mapping + * + * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + + file = shmem_file_setup("dev/zero", size, vma->vm_flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + return 0; +} diff --git a/mm/truncate.c b/mm/truncate.c new file mode 100644 index 000000000000..c9a63f0b69a2 --- /dev/null +++ b/mm/truncate.c @@ -0,0 +1,336 @@ +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 akpm@zip.com.au + * Initial version. + */ + +#include +#include +#include +#include +#include +#include /* grr. try_to_release_page, + block_invalidatepage */ + + +static int do_invalidatepage(struct page *page, unsigned long offset) +{ + int (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; + if (invalidatepage == NULL) + invalidatepage = block_invalidatepage; + return (*invalidatepage)(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (PagePrivate(page)) + do_invalidatepage(page, partial); +} + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes anonymous. It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_nopage(). + * + * We need to bale out if page->mapping is no longer equal to the original + * mapping. This happens a) when the VM reclaimed the page while we waited on + * its lock, b) when a concurrent invalidate_inode_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. + */ +static void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + do_invalidatepage(page, 0); + + clear_page_dirty(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ +} + +/* + * This is for invalidate_inode_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can + * be marked dirty at any time too. So we re-check the dirtiness inside + * ->tree_lock. That provides exclusion against the __set_page_dirty + * functions. + * + * Returns non-zero if the page was successfully invalidated. + */ +static int +invalidate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + write_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) { + write_unlock_irq(&mapping->tree_lock); + return 0; + } + + BUG_ON(PagePrivate(page)); + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + ClearPageUptodate(page); + page_cache_release(page); /* pagecache ref */ + return 1; +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Truncate the page cache at a set offset, removing the pages that are beyond + * that offset (and zeroing out partial pages). + * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * When looking at page->index outside the page lock we need to be careful to + * copy it into a local to avoid races (it could change at any time). + * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. + * + * Called under (and serialised by) inode->i_sem. + */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + struct pagevec pvec; + pgoff_t next; + int i; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + next = start; + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + if (page_index > next) + next = page_index; + next++; + if (TestSetPageLocked(page)) + continue; + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + + if (partial) { + struct page *page = find_lock_page(mapping, start - 1); + if (page) { + wait_on_page_writeback(page); + truncate_partial_page(page, partial); + unlock_page(page); + page_cache_release(page); + } + } + + next = start; + for ( ; ; ) { + cond_resched(); + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + wait_on_page_writeback(page); + if (page->index > next) + next = page->index; + next++; + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + } +} + +EXPORT_SYMBOL(truncate_inode_pages); + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + pgoff_t next = start; + unsigned long ret = 0; + int i; + + pagevec_init(&pvec, 0); + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + if (TestSetPageLocked(page)) { + next++; + continue; + } + if (page->index > next) + next = page->index; + next++; + if (PageDirty(page) || PageWriteback(page)) + goto unlock; + if (page_mapped(page)) + goto unlock; + ret += invalidate_complete_page(mapping, page); +unlock: + unlock_page(page); + if (next > end) + break; + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} + +unsigned long invalidate_inode_pages(struct address_space *mapping) +{ + return invalidate_mapping_pages(mapping, 0, ~0UL); +} + +EXPORT_SYMBOL(invalidate_inode_pages); + +/** + * invalidate_inode_pages2_range - remove range of pages from an address_space + * @mapping - the address_space + * @start: the page offset 'from' which to invalidate + * @end: the page offset 'to' which to invalidate (inclusive) + * + * Any pages which are found to be mapped into pagetables are unmapped prior to + * invalidation. + * + * Returns -EIO if any pages could not be invalidated. + */ +int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + pgoff_t next; + int i; + int ret = 0; + int did_range_unmap = 0; + int wrapped = 0; + + pagevec_init(&pvec, 0); + next = start; + while (next <= end && !ret && !wrapped && + pagevec_lookup(&pvec, mapping, next, + min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; !ret && i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index; + int was_dirty; + + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + page_index = page->index; + next = page_index + 1; + if (next == 0) + wrapped = 1; + if (page_index > end) { + unlock_page(page); + break; + } + wait_on_page_writeback(page); + while (page_mapped(page)) { + if (!did_range_unmap) { + /* + * Zap the rest of the file in one hit. + */ + unmap_mapping_range(mapping, + page_index << PAGE_CACHE_SHIFT, + (end - page_index + 1) + << PAGE_CACHE_SHIFT, + 0); + did_range_unmap = 1; + } else { + /* + * Just zap this page + */ + unmap_mapping_range(mapping, + page_index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + } + was_dirty = test_clear_page_dirty(page); + if (!invalidate_complete_page(mapping, page)) { + if (was_dirty) + set_page_dirty(page); + ret = -EIO; + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} +EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); + +/** + * invalidate_inode_pages2 - remove all pages from an address_space + * @mapping - the address_space + * + * Any pages which are found to be mapped into pagetables are unmapped prior to + * invalidation. + * + * Returns -EIO if any pages could not be invalidated. + */ +int invalidate_inode_pages2(struct address_space *mapping) +{ + return invalidate_inode_pages2_range(mapping, 0, -1); +} +EXPORT_SYMBOL_GPL(invalidate_inode_pages2); diff --git a/mm/vmalloc.c b/mm/vmalloc.c new file mode 100644 index 000000000000..c6182f6f1305 --- /dev/null +++ b/mm/vmalloc.c @@ -0,0 +1,588 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 + * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + + +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); + WARN_ON(!pte_none(ptent) && !pte_present(ptent)); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + vunmap_pte_range(pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + vunmap_pmd_range(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +void unmap_vm_area(struct vm_struct *area) +{ + pgd_t *pgd; + unsigned long next; + unsigned long addr = (unsigned long) area->addr; + unsigned long end = addr + area->size; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + flush_cache_vunmap(addr, end); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + vunmap_pud_range(pgd, addr, next); + } while (pgd++, addr = next, addr != end); + flush_tlb_kernel_range((unsigned long) area->addr, end); +} + +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page ***pages) +{ + pte_t *pte; + + pte = pte_alloc_kernel(&init_mm, pmd, addr); + if (!pte) + return -ENOMEM; + do { + struct page *page = **pages; + WARN_ON(!pte_none(*pte)); + if (!page) + return -ENOMEM; + set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); + (*pages)++; + } while (pte++, addr += PAGE_SIZE, addr != end); + return 0; +} + +static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page ***pages) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (vmap_pte_range(pmd, addr, next, prot, pages)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page ***pages) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(&init_mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (vmap_pmd_range(pud, addr, next, prot, pages)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +{ + pgd_t *pgd; + unsigned long next; + unsigned long addr = (unsigned long) area->addr; + unsigned long end = addr + area->size - PAGE_SIZE; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + spin_lock(&init_mm.page_table_lock); + do { + next = pgd_addr_end(addr, end); + err = vmap_pud_range(pgd, addr, next, prot, pages); + if (err) + break; + } while (pgd++, addr = next, addr != end); + spin_unlock(&init_mm.page_table_lock); + flush_cache_vmap((unsigned long) area->addr, end); + return err; +} + +#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ + +struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end) +{ + struct vm_struct **p, *tmp, *area; + unsigned long align = 1; + unsigned long addr; + + if (flags & VM_IOREMAP) { + int bit = fls(size); + + if (bit > IOREMAP_MAX_ORDER) + bit = IOREMAP_MAX_ORDER; + else if (bit < PAGE_SHIFT) + bit = PAGE_SHIFT; + + align = 1ul << bit; + } + addr = ALIGN(start, align); + size = PAGE_ALIGN(size); + + area = kmalloc(sizeof(*area), GFP_KERNEL); + if (unlikely(!area)) + return NULL; + + if (unlikely(!size)) { + kfree (area); + return NULL; + } + + /* + * We always allocate a guard page. + */ + size += PAGE_SIZE; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { + if ((unsigned long)tmp->addr < addr) { + if((unsigned long)tmp->addr + tmp->size >= addr) + addr = ALIGN(tmp->size + + (unsigned long)tmp->addr, align); + continue; + } + if ((size + addr) < addr) + goto out; + if (size + addr <= (unsigned long)tmp->addr) + goto found; + addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); + if (addr > end - size) + goto out; + } + +found: + area->next = *p; + *p = area; + + area->flags = flags; + area->addr = (void *)addr; + area->size = size; + area->pages = NULL; + area->nr_pages = 0; + area->phys_addr = 0; + write_unlock(&vmlist_lock); + + return area; + +out: + write_unlock(&vmlist_lock); + kfree(area); + if (printk_ratelimit()) + printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc= to increase size.\n"); + return NULL; +} + +/** + * get_vm_area - reserve a contingous kernel virtual area + * + * @size: size of the area + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC + * + * Search an area of @size in the kernel virtual mapping area, + * and reserved it for out purposes. Returns the area descriptor + * on success or %NULL on failure. + */ +struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) +{ + return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); +} + +/** + * remove_vm_area - find and remove a contingous kernel virtual area + * + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and remove it. + * This function returns the found VM area, but using it is NOT safe + * on SMP machines. + */ +struct vm_struct *remove_vm_area(void *addr) +{ + struct vm_struct **p, *tmp; + + write_lock(&vmlist_lock); + for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { + if (tmp->addr == addr) + goto found; + } + write_unlock(&vmlist_lock); + return NULL; + +found: + unmap_vm_area(tmp); + *p = tmp->next; + write_unlock(&vmlist_lock); + + /* + * Remove the guard page. + */ + tmp->size -= PAGE_SIZE; + return tmp; +} + +void __vunmap(void *addr, int deallocate_pages) +{ + struct vm_struct *area; + + if (!addr) + return; + + if ((PAGE_SIZE-1) & (unsigned long)addr) { + printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + WARN_ON(1); + return; + } + + area = remove_vm_area(addr); + if (unlikely(!area)) { + printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + WARN_ON(1); + return; + } + + if (deallocate_pages) { + int i; + + for (i = 0; i < area->nr_pages; i++) { + if (unlikely(!area->pages[i])) + BUG(); + __free_page(area->pages[i]); + } + + if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) + vfree(area->pages); + else + kfree(area->pages); + } + + kfree(area); + return; +} + +/** + * vfree - release memory allocated by vmalloc() + * + * @addr: memory base address + * + * Free the virtually contiguous memory area starting at @addr, as + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). + * + * May not be called in interrupt context. + */ +void vfree(void *addr) +{ + BUG_ON(in_interrupt()); + __vunmap(addr, 1); +} + +EXPORT_SYMBOL(vfree); + +/** + * vunmap - release virtual mapping obtained by vmap() + * + * @addr: memory base address + * + * Free the virtually contiguous memory area starting at @addr, + * which was created from the page array passed to vmap(). + * + * May not be called in interrupt context. + */ +void vunmap(void *addr) +{ + BUG_ON(in_interrupt()); + __vunmap(addr, 0); +} + +EXPORT_SYMBOL(vunmap); + +/** + * vmap - map an array of pages into virtually contiguous space + * + * @pages: array of page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. + */ +void *vmap(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + + if (count > num_physpages) + return NULL; + + area = get_vm_area((count << PAGE_SHIFT), flags); + if (!area) + return NULL; + if (map_vm_area(area, prot, &pages)) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} + +EXPORT_SYMBOL(vmap); + +void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot) +{ + struct page **pages; + unsigned int nr_pages, array_size, i; + + nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + array_size = (nr_pages * sizeof(struct page *)); + + area->nr_pages = nr_pages; + /* Please note that the recursion is strictly bounded. */ + if (array_size > PAGE_SIZE) + pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); + else + pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); + area->pages = pages; + if (!area->pages) { + remove_vm_area(area->addr); + kfree(area); + return NULL; + } + memset(area->pages, 0, array_size); + + for (i = 0; i < area->nr_pages; i++) { + area->pages[i] = alloc_page(gfp_mask); + if (unlikely(!area->pages[i])) { + /* Successfully allocated i pages, free them in __vunmap() */ + area->nr_pages = i; + goto fail; + } + } + + if (map_vm_area(area, prot, &pages)) + goto fail; + return area->addr; + +fail: + vfree(area->addr); + return NULL; +} + +/** + * __vmalloc - allocate virtually contiguous memory + * + * @size: allocation size + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot) +{ + struct vm_struct *area; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > num_physpages) + return NULL; + + area = get_vm_area(size, VM_ALLOC); + if (!area) + return NULL; + + return __vmalloc_area(area, gfp_mask, prot); +} + +EXPORT_SYMBOL(__vmalloc); + +/** + * vmalloc - allocate virtually contiguous memory + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight cotrol over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +} + +EXPORT_SYMBOL(vmalloc); + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight cotrol over page level allocator and protection flags + * use __vmalloc() instead. + */ + +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. + */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +} + +EXPORT_SYMBOL(vmalloc_32); + +long vread(char *buf, char *addr, unsigned long count) +{ + struct vm_struct *tmp; + char *vaddr, *buf_start = buf; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + read_lock(&vmlist_lock); + for (tmp = vmlist; tmp; tmp = tmp->next) { + vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + *buf = '\0'; + buf++; + addr++; + count--; + } + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { + if (count == 0) + goto finished; + *buf = *addr; + buf++; + addr++; + count--; + } while (--n > 0); + } +finished: + read_unlock(&vmlist_lock); + return buf - buf_start; +} + +long vwrite(char *buf, char *addr, unsigned long count) +{ + struct vm_struct *tmp; + char *vaddr, *buf_start = buf; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + read_lock(&vmlist_lock); + for (tmp = vmlist; tmp; tmp = tmp->next) { + vaddr = (char *) tmp->addr; + if (addr >= vaddr + tmp->size - PAGE_SIZE) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + buf++; + addr++; + count--; + } + n = vaddr + tmp->size - PAGE_SIZE - addr; + do { + if (count == 0) + goto finished; + *addr = *buf; + buf++; + addr++; + count--; + } while (--n > 0); + } +finished: + read_unlock(&vmlist_lock); + return buf - buf_start; +} diff --git a/mm/vmscan.c b/mm/vmscan.c new file mode 100644 index 000000000000..4003c0518d28 --- /dev/null +++ b/mm/vmscan.c @@ -0,0 +1,1311 @@ +/* + * linux/mm/vmscan.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. + * kswapd added: 7.1.96 sct + * Removed kswapd_ctl limits, and swap out as many pages as needed + * to bring the system back to freepages.high: 2.4.97, Rik van Riel. + * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). + * Multiqueue VM started 5.8.00, Rik van Riel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for try_to_release_page(), + buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +struct scan_control { + /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ + unsigned long nr_to_scan; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Incremented by the number of pages reclaimed */ + unsigned long nr_reclaimed; + + unsigned long nr_mapped; /* From page_state */ + + /* How many pages shrink_cache() should reclaim */ + int nr_to_reclaim; + + /* Ask shrink_caches, or shrink_zone to scan at this priority */ + unsigned int priority; + + /* This context's GFP mask */ + unsigned int gfp_mask; + + int may_writepage; + + /* This context's SWAP_CLUSTER_MAX. If freeing memory for + * suspend, we effectively ignore SWAP_CLUSTER_MAX. + * In this context, it doesn't matter that we scan the + * whole list at once. */ + int swap_cluster_max; +}; + +/* + * The list of shrinker callbacks used by to apply pressure to + * ageable caches. + */ +struct shrinker { + shrinker_t shrinker; + struct list_head list; + int seeks; /* seeks to recreate an obj */ + long nr; /* objs pending delete */ +}; + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +#ifdef ARCH_HAS_PREFETCH +#define prefetch_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetch(&prev->_field); \ + } \ + } while (0) +#else +#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +#ifdef ARCH_HAS_PREFETCHW +#define prefetchw_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetchw(&prev->_field); \ + } \ + } while (0) +#else +#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +/* + * From 0 .. 100. Higher means more swappy. + */ +int vm_swappiness = 60; +static long total_memory; + +static LIST_HEAD(shrinker_list); +static DECLARE_RWSEM(shrinker_rwsem); + +/* + * Add a shrinker callback to be called from the vm + */ +struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) +{ + struct shrinker *shrinker; + + shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); + if (shrinker) { + shrinker->shrinker = theshrinker; + shrinker->seeks = seeks; + shrinker->nr = 0; + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); + } + return shrinker; +} +EXPORT_SYMBOL(set_shrinker); + +/* + * Remove one + */ +void remove_shrinker(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + up_write(&shrinker_rwsem); + kfree(shrinker); +} +EXPORT_SYMBOL(remove_shrinker); + +#define SHRINK_BATCH 128 +/* + * Call the shrink functions to age shrinkable caches + * + * Here we assume it costs one seek to replace a lru page and that it also + * takes a seek to recreate a cache object. With this in mind we age equal + * percentages of the lru and ageable caches. This should balance the seeks + * generated by these structures. + * + * If the vm encounted mapped pages on the LRU it increase the pressure on + * slab to avoid swapping. + * + * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * + * `lru_pages' represents the number of on-LRU pages in all the zones which + * are eligible for the caller's allocation attempt. It is used for balancing + * slab reclaim versus page reclaim. + */ +static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, + unsigned long lru_pages) +{ + struct shrinker *shrinker; + + if (scanned == 0) + scanned = SWAP_CLUSTER_MAX; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + list_for_each_entry(shrinker, &shrinker_list, list) { + unsigned long long delta; + unsigned long total_scan; + + delta = (4 * scanned) / shrinker->seeks; + delta *= (*shrinker->shrinker)(0, gfp_mask); + do_div(delta, lru_pages + 1); + shrinker->nr += delta; + if (shrinker->nr < 0) + shrinker->nr = LONG_MAX; /* It wrapped! */ + + total_scan = shrinker->nr; + shrinker->nr = 0; + + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; + int shrink_ret; + + shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + if (shrink_ret == -1) + break; + mod_page_state(slabs_scanned, this_scan); + total_scan -= this_scan; + + cond_resched(); + } + + shrinker->nr += total_scan; + } + up_read(&shrinker_rwsem); + return 0; +} + +/* Called without lock on whether page is mapped, so answer is unstable */ +static inline int page_mapping_inuse(struct page *page) +{ + struct address_space *mapping; + + /* Page is in somebody's page tables. */ + if (page_mapped(page)) + return 1; + + /* Be more reluctant to reclaim swapcache than pagecache */ + if (PageSwapCache(page)) + return 1; + + mapping = page_mapping(page); + if (!mapping) + return 0; + + /* File is mmap'd by somebody? */ + return mapping_mapped(mapping); +} + +static inline int is_page_cache_freeable(struct page *page) +{ + return page_count(page) - !!PagePrivate(page) == 2; +} + +static int may_write_to_queue(struct backing_dev_info *bdi) +{ + if (current_is_kswapd()) + return 1; + if (current_is_pdflush()) /* This is unlikely, but why not... */ + return 1; + if (!bdi_write_congested(bdi)) + return 1; + if (bdi == current->backing_dev_info) + return 1; + return 0; +} + +/* + * We detected a synchronous write error writing a page out. Probably + * -ENOSPC. We need to propagate that into the address_space for a subsequent + * fsync(), msync() or close(). + * + * The tricky part is that after writepage we cannot touch the mapping: nothing + * prevents it from being freed up. But we have a ref on the page and once + * that page is locked, the mapping is pinned. + * + * We're allowed to run sleeping lock_page() here because we know the caller has + * __GFP_FS. + */ +static void handle_write_error(struct address_space *mapping, + struct page *page, int error) +{ + lock_page(page); + if (page_mapping(page) == mapping) { + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + unlock_page(page); +} + +/* + * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + */ +static pageout_t pageout(struct page *page, struct address_space *mapping) +{ + /* + * If the page is dirty, only perform writeback if that write + * will be non-blocking. To prevent this allocation from being + * stalled by pagecache activity. But note that there may be + * stalls if we need to run get_block(). We could test + * PagePrivate for that. + * + * If this process is currently in generic_file_write() against + * this page's queue, we can perform writeback even if that + * will block. + * + * If the page is swapcache, write it back even if that would + * block, for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the + * congestion state of the swapdevs. Easy to fix, if needed. + * See swapfile.c:page_queue_congested(). + */ + if (!is_page_cache_freeable(page)) + return PAGE_KEEP; + if (!mapping) { + /* + * Some data journaling orphaned pages can have + * page->mapping == NULL while being dirty with clean buffers. + */ + if (PageDirty(page) && PagePrivate(page)) { + if (try_to_free_buffers(page)) { + ClearPageDirty(page); + printk("%s: orphaned page\n", __FUNCTION__); + return PAGE_CLEAN; + } + } + return PAGE_KEEP; + } + if (mapping->a_ops->writepage == NULL) + return PAGE_ACTIVATE; + if (!may_write_to_queue(mapping->backing_dev_info)) + return PAGE_KEEP; + + if (clear_page_dirty_for_io(page)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .nonblocking = 1, + .for_reclaim = 1, + }; + + SetPageReclaim(page); + res = mapping->a_ops->writepage(page, &wbc); + if (res < 0) + handle_write_error(mapping, page, res); + if (res == WRITEPAGE_ACTIVATE) { + ClearPageReclaim(page); + return PAGE_ACTIVATE; + } + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ + ClearPageReclaim(page); + } + + return PAGE_SUCCESS; + } + + return PAGE_CLEAN; +} + +/* + * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed + */ +static int shrink_list(struct list_head *page_list, struct scan_control *sc) +{ + LIST_HEAD(ret_pages); + struct pagevec freed_pvec; + int pgactivate = 0; + int reclaimed = 0; + + cond_resched(); + + pagevec_init(&freed_pvec, 1); + while (!list_empty(page_list)) { + struct address_space *mapping; + struct page *page; + int may_enter_fs; + int referenced; + + cond_resched(); + + page = lru_to_page(page_list); + list_del(&page->lru); + + if (TestSetPageLocked(page)) + goto keep; + + BUG_ON(PageActive(page)); + + sc->nr_scanned++; + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + sc->nr_scanned++; + + if (PageWriteback(page)) + goto keep_locked; + + referenced = page_referenced(page, 1, sc->priority <= 0); + /* In active use or really unfreeable? Activate it. */ + if (referenced && page_mapping_inuse(page)) + goto activate_locked; + +#ifdef CONFIG_SWAP + /* + * Anonymous process memory has backing store? + * Try to allocate it some swap space here. + */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page)) + goto activate_locked; + } +#endif /* CONFIG_SWAP */ + + mapping = page_mapping(page); + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page_mapped(page) && mapping) { + switch (try_to_unmap(page)) { + case SWAP_FAIL: + goto activate_locked; + case SWAP_AGAIN: + goto keep_locked; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page)) { + if (referenced) + goto keep_locked; + if (!may_enter_fs) + goto keep_locked; + if (laptop_mode && !sc->may_writepage) + goto keep_locked; + + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + goto keep_locked; + case PAGE_ACTIVATE: + goto activate_locked; + case PAGE_SUCCESS: + if (PageWriteback(page) || PageDirty(page)) + goto keep; + /* + * A synchronous write - probably a ramdisk. Go + * ahead and try to reclaim the page. + */ + if (TestSetPageLocked(page)) + goto keep; + if (PageDirty(page) || PageWriteback(page)) + goto keep_locked; + mapping = page_mapping(page); + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + * + * We do this even if the page is PageDirty(). + * try_to_release_page() does not perform I/O, but it is + * possible for a page to have PageDirty set, but it is actually + * clean (all its buffers are clean). This happens if the + * buffers were written out directly, with submit_bh(). ext3 + * will do this, as well as the blockdev mapping. + * try_to_release_page() will discover that cleanness and will + * drop the buffers and mark the page clean - it can be freed. + * + * Rarely, pages can have buffers and no ->mapping. These are + * the pages which were not successfully invalidated in + * truncate_complete_page(). We try to drop those buffers here + * and if that worked, and the page is no longer mapped into + * process address space (page_count == 1) it can be freed. + * Otherwise, leave the page on the LRU so it is swappable. + */ + if (PagePrivate(page)) { + if (!try_to_release_page(page, sc->gfp_mask)) + goto activate_locked; + if (!mapping && page_count(page) == 1) + goto free_it; + } + + if (!mapping) + goto keep_locked; /* truncate got there first */ + + write_lock_irq(&mapping->tree_lock); + + /* + * The non-racy check for busy page. It is critical to check + * PageDirty _after_ making sure that the page is freeable and + * not in use by anybody. (pagecache + us == 2) + */ + if (page_count(page) != 2 || PageDirty(page)) { + write_unlock_irq(&mapping->tree_lock); + goto keep_locked; + } + +#ifdef CONFIG_SWAP + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page->private }; + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); + __put_page(page); /* The pagecache ref */ + goto free_it; + } +#endif /* CONFIG_SWAP */ + + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + __put_page(page); + +free_it: + unlock_page(page); + reclaimed++; + if (!pagevec_add(&freed_pvec, page)) + __pagevec_release_nonlru(&freed_pvec); + continue; + +activate_locked: + SetPageActive(page); + pgactivate++; +keep_locked: + unlock_page(page); +keep: + list_add(&page->lru, &ret_pages); + BUG_ON(PageLRU(page)); + } + list_splice(&ret_pages, page_list); + if (pagevec_count(&freed_pvec)) + __pagevec_release_nonlru(&freed_pvec); + mod_page_state(pgactivate, pgactivate); + sc->nr_reclaimed += reclaimed; + return reclaimed; +} + +/* + * zone->lru_lock is heavily contended. Some of the functions that + * shrink the lists perform better by taking out a batch of pages + * and working on them outside the LRU lock. + * + * For pagecache intensive workloads, this function is the hottest + * spot in the kernel (apart from copy_*_user functions). + * + * Appropriate locks must be held before calling this function. + * + * @nr_to_scan: The number of pages to look through on the list. + * @src: The LRU list to pull pages off. + * @dst: The temp list to put pages on to. + * @scanned: The number of pages that were scanned. + * + * returns how many pages were moved onto *@dst. + */ +static int isolate_lru_pages(int nr_to_scan, struct list_head *src, + struct list_head *dst, int *scanned) +{ + int nr_taken = 0; + struct page *page; + int scan = 0; + + while (scan++ < nr_to_scan && !list_empty(src)) { + page = lru_to_page(src); + prefetchw_prev_lru_page(page, src, flags); + + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + continue; + } else { + list_add(&page->lru, dst); + nr_taken++; + } + } + + *scanned = scan; + return nr_taken; +} + +/* + * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed + */ +static void shrink_cache(struct zone *zone, struct scan_control *sc) +{ + LIST_HEAD(page_list); + struct pagevec pvec; + int max_scan = sc->nr_to_scan; + + pagevec_init(&pvec, 1); + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + while (max_scan > 0) { + struct page *page; + int nr_taken; + int nr_scan; + int nr_freed; + + nr_taken = isolate_lru_pages(sc->swap_cluster_max, + &zone->inactive_list, + &page_list, &nr_scan); + zone->nr_inactive -= nr_taken; + zone->pages_scanned += nr_scan; + spin_unlock_irq(&zone->lru_lock); + + if (nr_taken == 0) + goto done; + + max_scan -= nr_scan; + if (current_is_kswapd()) + mod_page_state_zone(zone, pgscan_kswapd, nr_scan); + else + mod_page_state_zone(zone, pgscan_direct, nr_scan); + nr_freed = shrink_list(&page_list, sc); + if (current_is_kswapd()) + mod_page_state(kswapd_steal, nr_freed); + mod_page_state_zone(zone, pgsteal, nr_freed); + sc->nr_to_reclaim -= nr_freed; + + spin_lock_irq(&zone->lru_lock); + /* + * Put back any unfreeable pages. + */ + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + if (TestSetPageLRU(page)) + BUG(); + list_del(&page->lru); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + } + spin_unlock_irq(&zone->lru_lock); +done: + pagevec_release(&pvec); +} + +/* + * This moves pages from the active list to the inactive list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone->lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone->lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_count against each page. + * But we had to alter page->flags anyway. + */ +static void +refill_inactive_zone(struct zone *zone, struct scan_control *sc) +{ + int pgmoved; + int pgdeactivate = 0; + int pgscanned; + int nr_pages = sc->nr_to_scan; + LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ + LIST_HEAD(l_active); /* Pages to go onto the active_list */ + struct page *page; + struct pagevec pvec; + int reclaim_mapped = 0; + long mapped_ratio; + long distress; + long swap_tendency; + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, + &l_hold, &pgscanned); + zone->pages_scanned += pgscanned; + zone->nr_active -= pgmoved; + spin_unlock_irq(&zone->lru_lock); + + /* + * `distress' is a measure of how much trouble we're having reclaiming + * pages. 0 -> no problems. 100 -> great trouble. + */ + distress = 100 >> zone->prev_priority; + + /* + * The point of this algorithm is to decide when to start reclaiming + * mapped memory instead of just pagecache. Work out how much memory + * is mapped. + */ + mapped_ratio = (sc->nr_mapped * 100) / total_memory; + + /* + * Now decide how much we really want to unmap some pages. The mapped + * ratio is downgraded - just because there's a lot of mapped memory + * doesn't necessarily mean that page reclaim isn't succeeding. + * + * The distress ratio is important - we don't want to start going oom. + * + * A 100% value of vm_swappiness overrides this algorithm altogether. + */ + swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + + /* + * Now use this metric to decide whether to start moving mapped memory + * onto the inactive list. + */ + if (swap_tendency >= 100) + reclaim_mapped = 1; + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + if (page_mapped(page)) { + if (!reclaim_mapped || + (total_swap_pages == 0 && PageAnon(page)) || + page_referenced(page, 0, sc->priority <= 0)) { + list_add(&page->lru, &l_active); + continue; + } + } + list_add(&page->lru, &l_inactive); + } + + pagevec_init(&pvec, 1); + pgmoved = 0; + spin_lock_irq(&zone->lru_lock); + while (!list_empty(&l_inactive)) { + page = lru_to_page(&l_inactive); + prefetchw_prev_lru_page(page, &l_inactive, flags); + if (TestSetPageLRU(page)) + BUG(); + if (!TestClearPageActive(page)) + BUG(); + list_move(&page->lru, &zone->inactive_list); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + zone->nr_inactive += pgmoved; + spin_unlock_irq(&zone->lru_lock); + pgdeactivate += pgmoved; + pgmoved = 0; + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + zone->nr_inactive += pgmoved; + pgdeactivate += pgmoved; + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + pagevec_strip(&pvec); + spin_lock_irq(&zone->lru_lock); + } + + pgmoved = 0; + while (!list_empty(&l_active)) { + page = lru_to_page(&l_active); + prefetchw_prev_lru_page(page, &l_active, flags); + if (TestSetPageLRU(page)) + BUG(); + BUG_ON(!PageActive(page)); + list_move(&page->lru, &zone->active_list); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + zone->nr_active += pgmoved; + pgmoved = 0; + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + zone->nr_active += pgmoved; + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + mod_page_state_zone(zone, pgrefill, pgscanned); + mod_page_state(pgdeactivate, pgdeactivate); +} + +/* + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + */ +static void +shrink_zone(struct zone *zone, struct scan_control *sc) +{ + unsigned long nr_active; + unsigned long nr_inactive; + + /* + * Add one to `nr_to_scan' just to make sure that the kernel will + * slowly sift through the active list. + */ + zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + nr_active = zone->nr_scan_active; + if (nr_active >= sc->swap_cluster_max) + zone->nr_scan_active = 0; + else + nr_active = 0; + + zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + nr_inactive = zone->nr_scan_inactive; + if (nr_inactive >= sc->swap_cluster_max) + zone->nr_scan_inactive = 0; + else + nr_inactive = 0; + + sc->nr_to_reclaim = sc->swap_cluster_max; + + while (nr_active || nr_inactive) { + if (nr_active) { + sc->nr_to_scan = min(nr_active, + (unsigned long)sc->swap_cluster_max); + nr_active -= sc->nr_to_scan; + refill_inactive_zone(zone, sc); + } + + if (nr_inactive) { + sc->nr_to_scan = min(nr_inactive, + (unsigned long)sc->swap_cluster_max); + nr_inactive -= sc->nr_to_scan; + shrink_cache(zone, sc); + if (sc->nr_to_reclaim <= 0) + break; + } + } + + throttle_vm_writeout(); +} + +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. + * + * We reclaim from a zone even if that zone is over pages_high. Because: + * a) The caller may be trying to free *extra* pages to satisfy a higher-order + * allocation or + * b) The zones may be over pages_high but they must go *over* pages_high to + * satisfy the `incremental min' zone defense algorithm. + * + * Returns the number of reclaimed pages. + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. + */ +static void +shrink_caches(struct zone **zones, struct scan_control *sc) +{ + int i; + + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + + if (zone->present_pages == 0) + continue; + + if (!cpuset_zone_allowed(zone)) + continue; + + zone->temp_priority = sc->priority; + if (zone->prev_priority > sc->priority) + zone->prev_priority = sc->priority; + + if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ + + shrink_zone(zone, sc); + } +} + +/* + * This is the main entry point to direct page reclaim. + * + * If a full scan of the inactive list fails to free enough memory then we + * are "out of memory" and something needs to be killed. + * + * If the caller is !__GFP_FS then the probability of a failure is reasonably + * high - the zone may be full of dirty or under-writeback pages, which this + * caller can't do much about. We kick pdflush and take explicit naps in the + * hope that some of these pages can be written. But if the allocating task + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. + */ +int try_to_free_pages(struct zone **zones, + unsigned int gfp_mask, unsigned int order) +{ + int priority; + int ret = 0; + int total_scanned = 0, total_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct scan_control sc; + unsigned long lru_pages = 0; + int i; + + sc.gfp_mask = gfp_mask; + sc.may_writepage = 0; + + inc_page_state(allocstall); + + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + + if (!cpuset_zone_allowed(zone)) + continue; + + zone->temp_priority = DEF_PRIORITY; + lru_pages += zone->nr_active + zone->nr_inactive; + } + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + sc.priority = priority; + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + shrink_caches(zones, &sc); + shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); + if (reclaim_state) { + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + total_scanned += sc.nr_scanned; + total_reclaimed += sc.nr_reclaimed; + if (total_reclaimed >= sc.swap_cluster_max) { + ret = 1; + goto out; + } + + /* + * Try to write back as many pages as we just scanned. This + * tends to cause slow streaming writers to write data to the + * disk smoothly, at the dirtying rate, which is nice. But + * that's undesirable in laptop mode, where we *want* lumpy + * writeout. So in laptop mode, write out the whole world. + */ + if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { + wakeup_bdflush(laptop_mode ? 0 : total_scanned); + sc.may_writepage = 1; + } + + /* Take a nap, wait for some writeback to complete */ + if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ/10); + } +out: + for (i = 0; zones[i] != 0; i++) { + struct zone *zone = zones[i]; + + if (!cpuset_zone_allowed(zone)) + continue; + + zone->prev_priority = zone->temp_priority; + } + return ret; +} + +/* + * For kswapd, balance_pgdat() will work across all this node's zones until + * they are all at pages_high. + * + * If `nr_pages' is non-zero then it is the number of pages which are to be + * reclaimed, regardless of the zone occupancies. This is a software suspend + * special. + * + * Returns the number of pages which were actually freed. + * + * There is special handling here for zones which are full of pinned pages. + * This can happen if the pages are all mlocked, or if they are all used by + * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. + * What we do is to detect the case where all pages in the zone have been + * scanned twice and there has been zero successful reclaim. Mark the zone as + * dead and from now on, only perform a short scan. Basically we're polling + * the zone for when the problem goes away. + * + * kswapd scans the zones in the highmem->normal->dma direction. It skips + * zones which have free_pages > pages_high, but once a zone is found to have + * free_pages <= pages_high, we scan that zone and the lower zones regardless + * of the number of free pages in the lower zones. This interoperates with + * the page allocator fallback scheme to ensure that aging of pages is balanced + * across the zones. + */ +static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +{ + int to_free = nr_pages; + int all_zones_ok; + int priority; + int i; + int total_scanned, total_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct scan_control sc; + +loop_again: + total_scanned = 0; + total_reclaimed = 0; + sc.gfp_mask = GFP_KERNEL; + sc.may_writepage = 0; + sc.nr_mapped = read_page_state(nr_mapped); + + inc_page_state(pageoutrun); + + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->temp_priority = DEF_PRIORITY; + } + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + + all_zones_ok = 1; + + if (nr_pages == 0) { + /* + * Scan in the highmem->dma direction for the highest + * zone which needs scanning + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; + + if (zone->present_pages == 0) + continue; + + if (zone->all_unreclaimable && + priority != DEF_PRIORITY) + continue; + + if (!zone_watermark_ok(zone, order, + zone->pages_high, 0, 0, 0)) { + end_zone = i; + goto scan; + } + } + goto out; + } else { + end_zone = pgdat->nr_zones - 1; + } +scan: + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + lru_pages += zone->nr_active + zone->nr_inactive; + } + + /* + * Now scan the zone in the dma->highmem direction, stopping + * at the last zone which needs scanning. + * + * We do this because the page allocator works in the opposite + * direction. This prevents the page allocator from allocating + * pages behind kswapd's direction of progress, which would + * cause too much scanning of the lower zones. + */ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (zone->present_pages == 0) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + if (nr_pages == 0) { /* Not software suspend */ + if (!zone_watermark_ok(zone, order, + zone->pages_high, end_zone, 0, 0)) + all_zones_ok = 0; + } + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + sc.priority = priority; + sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; + shrink_zone(zone, &sc); + reclaim_state->reclaimed_slab = 0; + shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + total_reclaimed += sc.nr_reclaimed; + total_scanned += sc.nr_scanned; + if (zone->all_unreclaimable) + continue; + if (zone->pages_scanned >= (zone->nr_active + + zone->nr_inactive) * 4) + zone->all_unreclaimable = 1; + /* + * If we've done a decent amount of scanning and + * the reclaim ratio is low, start doing writepage + * even in laptop mode + */ + if (total_scanned > SWAP_CLUSTER_MAX * 2 && + total_scanned > total_reclaimed+total_reclaimed/2) + sc.may_writepage = 1; + } + if (nr_pages && to_free > total_reclaimed) + continue; /* swsusp: need to do more work */ + if (all_zones_ok) + break; /* kswapd: all done */ + /* + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ + if (total_scanned && priority < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ/10); + + /* + * We do this so kswapd doesn't build up large priorities for + * example when it is freeing in parallel with allocators. It + * matches the direct reclaim path behaviour in terms of impact + * on zone->*_priority. + */ + if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) + break; + } +out: + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->prev_priority = zone->temp_priority; + } + if (!all_zones_ok) { + cond_resched(); + goto loop_again; + } + + return total_reclaimed; +} + +/* + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically trickles out pages so that we have _some_ + * free memory available even if there is no other activity + * that frees anything up. This is needed for things like routing + * etc, where we otherwise might have all activity going on in + * asynchronous contexts that cannot page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. + */ +static int kswapd(void *p) +{ + unsigned long order; + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + cpumask_t cpumask; + + daemonize("kswapd%d", pgdat->node_id); + cpumask = node_to_cpumask(pgdat->node_id); + if (!cpus_empty(cpumask)) + set_cpus_allowed(tsk, cpumask); + current->reclaim_state = &reclaim_state; + + /* + * Tell the memory management that we're a "memory allocator", + * and that if we need more memory we should get access to it + * regardless (see "__alloc_pages()"). "kswapd" should + * never get caught in the normal page freeing logic. + * + * (Kswapd normally doesn't need memory anyway, but sometimes + * you need a small amount of memory in order to be able to + * page out something else, and this flag essentially protects + * us from recursively trying to free more memory as we're + * trying to free the first piece of memory in the first place). + */ + tsk->flags |= PF_MEMALLOC|PF_KSWAPD; + + order = 0; + for ( ; ; ) { + unsigned long new_order; + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; + if (order < new_order) { + /* + * Don't sleep if someone wants a larger 'order' + * allocation + */ + order = new_order; + } else { + schedule(); + order = pgdat->kswapd_max_order; + } + finish_wait(&pgdat->kswapd_wait, &wait); + + balance_pgdat(pgdat, 0, order); + } + return 0; +} + +/* + * A zone is low on free memory, so wake its kswapd task to service it. + */ +void wakeup_kswapd(struct zone *zone, int order) +{ + pg_data_t *pgdat; + + if (zone->present_pages == 0) + return; + + pgdat = zone->zone_pgdat; + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) + return; + if (pgdat->kswapd_max_order < order) + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed(zone)) + return; + if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return; + wake_up_interruptible(&zone->zone_pgdat->kswapd_wait); +} + +#ifdef CONFIG_PM +/* + * Try to free `nr_pages' of memory, system-wide. Returns the number of freed + * pages. + */ +int shrink_all_memory(int nr_pages) +{ + pg_data_t *pgdat; + int nr_to_free = nr_pages; + int ret = 0; + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + + current->reclaim_state = &reclaim_state; + for_each_pgdat(pgdat) { + int freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); + ret += freed; + nr_to_free -= freed; + if (nr_to_free <= 0) + break; + } + current->reclaim_state = NULL; + return ret; +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, we get changed to run anywhere: as the first one comes back, + restore their cpu bindings. */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + pg_data_t *pgdat; + cpumask_t mask; + + if (action == CPU_ONLINE) { + for_each_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + if (any_online_cpu(mask) != NR_CPUS) + /* One of our CPUs online: restore mask */ + set_cpus_allowed(pgdat->kswapd, mask); + } + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init kswapd_init(void) +{ + pg_data_t *pgdat; + swap_setup(); + for_each_pgdat(pgdat) + pgdat->kswapd + = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + total_memory = nr_free_pagecache_pages(); + hotcpu_notifier(cpu_callback, 0); + return 0; +} + +module_init(kswapd_init) -- cgit v1.2.3-70-g09d2