diff options
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/bio.c | 2 | ||||
| -rw-r--r-- | fs/block_dev.c | 200 | 
2 files changed, 175 insertions, 27 deletions
Summary of this patch (text before the first "diff --git" line is ignored
by patch tools):

  fs/bio.c
    bio_release_pages() loses its "static" qualifier so that it can be
    called from fs/block_dev.c.

  fs/block_dev.c
    blkdev_get_blocks() and the call to blockdev_direct_IO_no_locking()
    are removed.  blkdev_direct_IO() now builds and submits bios itself:
      - blk_end_aio()   bio completion handler; drops one reference on
                        iocb->ki_bio_count and calls aio_complete() when
                        the count reaches zero.
      - struct pvec     small cache of up to VEC_SIZE struct page pointers.
      - PAGES_SPANNED() number of pages covered by (addr, len).
      - blk_get_page()  returns the next page for a user address,
                        refilling the pvec via get_user_pages() when it
                        is exhausted.

diff --git a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -916,7 +916,7 @@ void bio_set_pages_dirty(struct bio *bio)
 	}
 }
 
-static void bio_release_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio)
 {
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int i;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 197f93921847..1715d6b5f411 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -129,43 +129,191 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
+static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
-	sector_t end_block = max_block(I_BDEV(inode));
-	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
+	struct kiocb *iocb = bio->bi_private;
+	atomic_t *bio_count = &iocb->ki_bio_count;
 
-	if ((iblock + max_blocks) > end_block) {
-		max_blocks = end_block - iblock;
-		if ((long)max_blocks <= 0) {
-			if (create)
-				return -EIO;	/* write fully beyond EOF */
-			/*
-			 * It is a read which is fully beyond EOF.  We return
-			 * a !buffer_mapped buffer
-			 */
-			max_blocks = 0;
-		}
+	if (bio_data_dir(bio) == READ)
+		bio_check_pages_dirty(bio);
+	else {
+		bio_release_pages(bio);
+		bio_put(bio);
+	}
+
+	/* iocb->ki_nbytes stores error code from LLDD */
+	if (error)
+		iocb->ki_nbytes = -EIO;
+
+	if (atomic_dec_and_test(bio_count)) {
+		if (iocb->ki_nbytes < 0)
+			aio_complete(iocb, iocb->ki_nbytes, 0);
+		else
+			aio_complete(iocb, iocb->ki_left, 0);
 	}
 
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	bh->b_size = max_blocks << inode->i_blkbits;
-	if (max_blocks)
-		set_buffer_mapped(bh);
 	return 0;
 }
 
+#define VEC_SIZE	16
+struct pvec {
+	unsigned short nr;
+	unsigned short idx;
+	struct page *page[VEC_SIZE];
+};
+
+#define PAGES_SPANNED(addr, len)	\
+	(DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE)
+
+/*
+ * get page pointer for user addr, we internally cache struct page array for
+ * (addr, count) range in pvec to avoid frequent call to get_user_pages.  If
+ * internal page list is exhausted, a batch count of up to VEC_SIZE is used
+ * to get next set of page struct.
+ */
+static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
+				 struct pvec *pvec)
+{
+	int ret, nr_pages;
+	if (pvec->idx == pvec->nr) {
+		nr_pages = PAGES_SPANNED(addr, count);
+		nr_pages = min(nr_pages, VEC_SIZE);
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(current, current->mm, addr, nr_pages,
+				     rw == READ, 0, pvec->page, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		pvec->nr = ret;
+		pvec->idx = 0;
+	}
+	return pvec->page[pvec->idx++];
+}
+
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-			loff_t offset, unsigned long nr_segs)
+		 loff_t pos, unsigned long nr_segs)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
+	unsigned blocksize_mask = (1 << blkbits) - 1;
+	unsigned long seg = 0;	/* iov segment iterator */
+	unsigned long nvec;	/* number of bio vec needed */
+	unsigned long cur_off;	/* offset into current page */
+	unsigned long cur_len;	/* I/O len of current page, up to PAGE_SIZE */
+
+	unsigned long addr;	/* user iovec address */
+	size_t count;		/* user iovec len */
+	size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */
+	loff_t size;		/* size of block device */
+	struct bio *bio;
+	atomic_t *bio_count = &iocb->ki_bio_count;
+	struct page *page;
+	struct pvec pvec;
+
+	pvec.nr = 0;
+	pvec.idx = 0;
+
+	if (pos & blocksize_mask)
+		return -EINVAL;
+
+	size = i_size_read(inode);
+	if (pos + nbytes > size) {
+		nbytes = size - pos;
+		iocb->ki_left = nbytes;
+	}
+
+	/*
+	 * check first non-zero iov alignment, the remaining
+	 * iov alignment is checked inside bio loop below.
+	 */
+	do {
+		addr = (unsigned long) iov[seg].iov_base;
+		count = min(iov[seg].iov_len, nbytes);
+		if (addr & blocksize_mask || count & blocksize_mask)
+			return -EINVAL;
+	} while (!count && ++seg < nr_segs);
+	atomic_set(bio_count, 1);
 
-	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
-				iov, offset, nr_segs, blkdev_get_blocks, NULL);
+	while (nbytes) {
+		/* roughly estimate number of bio vec needed */
+		nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		nvec = max(nvec, nr_segs - seg);
+		nvec = min(nvec, (unsigned long) BIO_MAX_PAGES);
+
+		/* bio_alloc should not fail with GFP_KERNEL flag */
+		bio = bio_alloc(GFP_KERNEL, nvec);
+		bio->bi_bdev = I_BDEV(inode);
+		bio->bi_end_io = blk_end_aio;
+		bio->bi_private = iocb;
+		bio->bi_sector = pos >> blkbits;
+same_bio:
+		cur_off = addr & ~PAGE_MASK;
+		cur_len = PAGE_SIZE - cur_off;
+		if (count < cur_len)
+			cur_len = count;
+
+		page = blk_get_page(addr, count, rw, &pvec);
+		if (unlikely(IS_ERR(page)))
+			goto backout;
+
+		if (bio_add_page(bio, page, cur_len, cur_off)) {
+			pos += cur_len;
+			addr += cur_len;
+			count -= cur_len;
+			nbytes -= cur_len;
+
+			if (count)
+				goto same_bio;
+			while (++seg < nr_segs) {
+				addr = (unsigned long) iov[seg].iov_base;
+				count = iov[seg].iov_len;
+				if (!count)
+					continue;
+				if (unlikely(addr & blocksize_mask ||
+					     count & blocksize_mask)) {
+					page = ERR_PTR(-EINVAL);
+					goto backout;
+				}
+				count = min(count, nbytes);
+				goto same_bio;
+			}
+		}
+
+		/* bio is ready, submit it */
+		if (rw == READ)
+			bio_set_pages_dirty(bio);
+		atomic_inc(bio_count);
+		submit_bio(rw, bio);
+	}
+
+completion:
+	iocb->ki_left -= nbytes;
+	nbytes = iocb->ki_left;
+	iocb->ki_pos += nbytes;
+
+	blk_run_address_space(inode->i_mapping);
+	if (atomic_dec_and_test(bio_count))
+		aio_complete(iocb, nbytes, 0);
+
+	return -EIOCBQUEUED;
+
+backout:
+	/*
+	 * back out nbytes count constructed so far for this bio,
+	 * we will throw away current bio.
+	 */
+	nbytes += bio->bi_size;
+	bio_release_pages(bio);
+	bio_put(bio);
+
+	/*
+	 * if no bio was submitted, return the error code.
+	 * otherwise, proceed with pending I/O completion.
+	 */
+	if (atomic_read(bio_count) == 1)
+		return PTR_ERR(page);
+	goto completion;
 }
 
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
