diff options
Diffstat (limited to 'fs/nfs')
36 files changed, 4631 insertions, 572 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 81515545ba7..dbcd82126ae 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -77,6 +77,7 @@ config NFS_V4  config NFS_V4_1  	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"  	depends on NFS_FS && NFS_V4 && EXPERIMENTAL +	select SUNRPC_BACKCHANNEL  	select PNFS_FILE_LAYOUT  	help  	  This option enables support for minor version 1 of the NFSv4 protocol @@ -87,15 +88,15 @@ config NFS_V4_1  config PNFS_FILE_LAYOUT  	tristate +config PNFS_BLOCK +	tristate +	depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM +	default m +  config PNFS_OBJLAYOUT -	tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" +	tristate  	depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD -	help -	  Say M here if you want your pNFS client to support the Objects Layout Driver. -	  Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and -	  upper level driver (SCSI_OSD_ULD). - -	  If unsure, say N. +	default m  config ROOT_NFS  	bool "Root file system on NFS" diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6a34f7dd0e6..b58613d0abb 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o  nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o  obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 00000000000..d5815505c02 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 00000000000..9561c8fc8bd --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,1020 @@ +/* + *  
linux/fs/nfs/blocklayout/blocklayout.c + * + *  Module for the NFSv4.1 pNFS block layout driver. + * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h>		/* struct bio */
+#include <linux/buffer_head.h>	/* various write calls */
+#include <linux/prefetch.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY	NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+/* Dentry of the "blocklayout" rpc_pipefs pipe; set by the module init
+ * function and unlinked by the module exit function.
+ */
+struct dentry *bl_device_pipe;
+/* Wait queue head, initialized by the module init function. */
+wait_queue_head_t bl_wq;
+
+/* Debug helper: dump the state of the page flags that matter to this
+ * driver through dprintk().  Has no effect on the page itself.
+ */
+static void print_page(struct page *page)
+{
+	dprintk("PRINTPAGE page %p\n", page);
+	dprintk("	PagePrivate %d\n", PagePrivate(page));
+	dprintk("	PageUptodate %d\n", PageUptodate(page));
+	dprintk("	PageError %d\n", PageError(page));
+	dprintk("	PageDirty %d\n", PageDirty(page));
+	dprintk("	PageReferenced %d\n", PageReferenced(page));
+	dprintk("	PageLocked %d\n", PageLocked(page));
+	dprintk("	PageWriteback %d\n", PageWriteback(page));
+	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
+	dprintk("\n");
+}
+
+/* Given the be associated with isect, determine if page data needs to be
+ * initialized.
+ *
+ * Returns 1 for a NONE_DATA extent, 0 for any state other than
+ * INVALID_DATA, and for an INVALID_DATA extent returns 1 only while
+ * bl_is_sector_init() reports that isect has not been initialized yet.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+	if (be->be_state == PNFS_BLOCK_NONE_DATA)
+		return 1;
+	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+		return 0;
+	else
+		return !bl_is_sector_init(be->be_inval, isect);
+}
+
+/* Given the be associated with isect, determine if page data can be
+ * written to disk.
+ *
+ * Only the extent state is consulted (READWRITE_DATA or INVALID_DATA);
+ * isect is currently unused.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		be->be_state == PNFS_BLOCK_INVALID_DATA);
+}
+
+/* The data we are handed might be spread across several bios.  We need
+ * to track when the last one is finished.
+ */
+struct parallel_io {
+	struct kref refcnt;		/* one ref per in-flight bio plus the submitter */
+	struct rpc_call_ops call_ops;	/* copy of mds_ops with rpc_call_done stubbed */
+	void (*pnfs_callback) (void *data); /* run when the last ref is dropped */
+	void *data;			/* nfs_read_data or nfs_write_data */
+};
+
+/* Allocate a parallel_io tracking @data.  Only ->data and ->refcnt are
+ * initialized here (kmalloc does not zero); the caller must fill in
+ * ->call_ops and ->pnfs_callback before the last reference can drop.
+ * Returns NULL on allocation failure.
+ */
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+	struct parallel_io *rv;
+
+	rv  = kmalloc(sizeof(*rv), GFP_NOFS);
+	if (rv) {
+		rv->data = data;
+		kref_init(&rv->refcnt);
+	}
+	return rv;
+}
+
+/* Take an additional reference on @p. */
+static inline void get_parallel(struct parallel_io *p)
+{
+	kref_get(&p->refcnt);
+}
+
+/* kref release function: invoke the completion callback, then free. */
+static void destroy_parallel(struct kref *kref)
+{
+	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+	dprintk("%s enter\n", __func__);
+	p->pnfs_callback(p->data);
+	kfree(p);
+}
+
+/* Drop a reference on @p; the last put runs destroy_parallel(). */
+static inline void put_parallel(struct parallel_io *p)
+{
+	kref_put(&p->refcnt, destroy_parallel);
+}
+
+/* Submit @bio (if any), taking a parallel_io reference on its behalf.
+ * Always returns NULL so callers can write "bio = bl_submit_bio(...)".
+ * Only a NULL @bio is tolerated: any other value, including an ERR_PTR,
+ * is dereferenced.
+ */
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+	if (bio) {
+		get_parallel(bio->bi_private);
+		dprintk("%s submitting %s bio %u@%llu\n", __func__,
+			rw == READ ? "read" : "write",
+			bio->bi_size, (unsigned long long)bio->bi_sector);
+		submit_bio(rw, bio);
+	}
+	return NULL;
+}
+
+/* Allocate a bio with room for @npg pages, targeting the device sector
+ * that file sector @isect maps to inside extent @be.  Returns NULL if
+ * bio_alloc() fails.
+ */
+static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
+				     struct pnfs_block_extent *be,
+				     void (*end_io)(struct bio *, int err),
+				     struct parallel_io *par)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOIO, npg);
+	if (!bio)
+		return NULL;
+
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = end_io;
+	bio->bi_private = par;
+	return bio;
+}
+
+/* Append @page to @bio, allocating a bio first when @bio is NULL.  If
+ * the page does not fit, the current bio is submitted and a fresh one
+ * is tried.  Returns the bio now holding the page, or ERR_PTR(-ENOMEM)
+ * when a bio could not be allocated (in which case no bio is held).
+ */
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+				      sector_t isect, struct page *page,
+				      struct pnfs_block_extent *be,
+				      void (*end_io)(struct bio *, int err),
+				      struct parallel_io *par)
+{
+retry:
+	if (!bio) {
+		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+		if (!bio)
+			return ERR_PTR(-ENOMEM);
+	}
+	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+		bio = bl_submit_bio(rw, bio);
+		goto retry;
+	}
+	return bio;
+}
+
+/* Set the layout header's fail bit matching the iomode of @lseg. */
+static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+	} else {
+		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	}
+}
+
+/* This is basically copied from mpage_end_io_read */
+/* bio completion for reads: mark every page uptodate on success; on
+ * failure record -EIO (unless an error is already set) and flag the
+ * layout.  Drops the bio and the parallel_io reference taken at submit.
+ */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+	struct parallel_io *par = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
+
+	/* Walk the bio_vec array from last to first entry */
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+		if (uptodate)
+			SetPageUptodate(page);
+	} while (bvec >= bio->bi_io_vec);
+	if (!uptodate) {
+		if (!rdata->pnfs_error)
+			rdata->pnfs_error = -EIO;
+		bl_set_lo_fail(rdata->lseg);
+	}
+	bio_put(bio);
+	put_parallel(par);
+}
+
+/* Work function: recover the nfs_read_data embedding the work item and
+ * hand it to pnfs_ld_read_done().
+ */
+static void bl_read_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_read_data *rdata;
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	rdata = container_of(task, struct nfs_read_data, task);
+	pnfs_ld_read_done(rdata);
+}
+
+/* parallel_io completion callback for reads: defer the final read
+ * processing to a work item (bl_read_cleanup).
+ */
+static void
+bl_end_par_io_read(void *data)
+{
+	struct nfs_read_data *rdata = data;
+
+	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
+	schedule_work(&rdata->task.u.tk_work);
+}
+
+/* We don't want normal .rpc_call_done callback used, so we replace it
+ * with this stub.
+ */
+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
+{
+	return;
+}
+
+/* Read the pages described by @rdata directly from the block device(s)
+ * backing the layout.
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io tracker cannot
+ * be allocated.  Otherwise returns PNFS_ATTEMPTED; any failure is
+ * reported through rdata->pnfs_error, and completion is signalled via
+ * bl_end_par_io_read() once the last parallel_io reference is dropped.
+ */
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata)
+{
+	int i, hole;
+	struct bio *bio = NULL;
+	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	sector_t isect, extent_length = 0;
+	struct parallel_io *par;
+	loff_t f_offset = rdata->args.offset;
+	size_t count = rdata->args.count;
+	struct page **pages = rdata->args.pages;
+	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+	       rdata->npages, f_offset, count);
+
+	par = alloc_parallel(rdata);
+	if (!par)
+		goto use_mds;
+	par->call_ops = *rdata->mds_ops;
+	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+	par->pnfs_callback = bl_end_par_io_read;
+	/* At this point, we can no longer jump to use_mds */
+
+	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+	/* Code assumes extents are page-aligned */
+	for (i = pg_index; i < rdata->npages; i++) {
+		if (!extent_length) {
+			/* We've used up the previous extent */
+			bl_put_extent(be);
+			bl_put_extent(cow_read);
+			bio = bl_submit_bio(READ, bio);
+			/* Get the next one */
+			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+					     isect, &cow_read);
+			if (!be) {
+				rdata->pnfs_error = -EIO;
+				goto out;
+			}
+			extent_length = be->be_length -
+				(isect - be->be_f_offset);
+			if (cow_read) {
+				sector_t cow_length = cow_read->be_length -
+					(isect - cow_read->be_f_offset);
+				extent_length = min(extent_length, cow_length);
+			}
+		}
+		hole = is_hole(be, isect);
+		if (hole && !cow_read) {
+			bio = bl_submit_bio(READ, bio);
+			/* Fill hole w/ zeroes w/o accessing device */
+			dprintk("%s Zeroing page for hole\n", __func__);
+			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+			print_page(pages[i]);
+			SetPageUptodate(pages[i]);
+		} else {
+			struct pnfs_block_extent *be_read;
+
+			be_read = (hole && cow_read) ? cow_read : be;
+			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+						 isect, pages[i], be_read,
+						 bl_end_io_read, par);
+			if (IS_ERR(bio)) {
+				rdata->pnfs_error = PTR_ERR(bio);
+				/* No bio is held on this path; clear the
+				 * ERR_PTR so that the submit at "out" does
+				 * not dereference it.
+				 */
+				bio = NULL;
+				goto out;
+			}
+		}
+		isect += PAGE_CACHE_SECTORS;
+		extent_length -= PAGE_CACHE_SECTORS;
+	}
+	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+		rdata->res.eof = 1;
+		rdata->res.count = rdata->inode->i_size - f_offset;
+	} else {
+		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+	}
+out:
+	bl_put_extent(be);
+	bl_put_extent(cow_read);
+	bl_submit_bio(READ, bio);
+	/* Drop the submitter's reference; completion fires once every
+	 * outstanding bio has dropped its own.
+	 */
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
+
+ use_mds:
+	dprintk("Giving up and using normal NFS\n");
+	return PNFS_NOT_ATTEMPTED;
+}
+
+/* Walk the extents covering the page-aligned range around
+ * [@offset, @offset + @count) and call bl_mark_for_commit() on the
+ * parts that lie in INVALID_DATA extents.  A zero @count is a no-op.
+ */
+static void mark_extents_written(struct pnfs_block_layout *bl,
+				 __u64 offset, __u32 count)
+{
+	sector_t isect, end;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s(%llu, %u)\n", __func__, offset, count);
+	if (count == 0)
+		return;
+	/* Round the start down and the end up to page boundaries */
+	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
+	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
+	end >>= SECTOR_SHIFT;
+	while (isect < end) {
+		sector_t len;
+		be = bl_find_get_extent(bl, isect, NULL);
+		BUG_ON(!be); /* FIXME */
+		len = min(end, be->be_f_offset + be->be_length) - isect;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+			bl_mark_for_commit(be, isect, len); /* What if fails? */
+		isect += len;
+		bl_put_extent(be);
+	}
+}
+
+/* bio completion for the zero-fill pages added by bl_write_pagelist():
+ * end writeback on each page and drop the page reference taken when
+ * the page was looked up.  On failure record -EIO (unless an error is
+ * already set) and flag the layout.
+ */
+static void bl_end_io_write_zero(struct bio *bio, int err)
+{
+	struct parallel_io *par = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+		/* This is the zeroing page we added */
+		end_page_writeback(page);
+		page_cache_release(page);
+	} while (bvec >= bio->bi_io_vec);
+	if (!uptodate) {
+		if (!wdata->pnfs_error)
+			wdata->pnfs_error = -EIO;
+		bl_set_lo_fail(wdata->lseg);
+	}
+	bio_put(bio);
+	put_parallel(par);
+}
+
+/* This is basically copied from mpage_end_io_read */
+/* bio completion for caller-supplied write pages: the pages themselves
+ * are not touched here; only the error state is recorded.
+ */
+static void bl_end_io_write(struct bio *bio, int err)
+{
+	struct parallel_io *par = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+
+	if (!uptodate) {
+		if (!wdata->pnfs_error)
+			wdata->pnfs_error = -EIO;
+		bl_set_lo_fail(wdata->lseg);
+	}
+	bio_put(bio);
+	put_parallel(par);
+}
+
+/* Function scheduled for call during bl_end_par_io_write,
+ * it marks sectors as written and extends the commitlist.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_write_data *wdata;
+	dprintk("%s enter\n", __func__);
+	/* The work item is embedded in the rpc_task inside nfs_write_data */
+	task = container_of(work, struct rpc_task, u.tk_work);
+	wdata = container_of(task, struct nfs_write_data, task);
+	if (!wdata->pnfs_error) {
+		/* Marks for LAYOUTCOMMIT */
+		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+				     wdata->args.offset, wdata->args.count);
+	}
+	pnfs_ld_write_done(wdata);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes */
+/* Sets the task status to 0 and the verifier to NFS_FILE_SYNC, then
+ * defers the remaining completion work to bl_write_cleanup().
+ */
+static void bl_end_par_io_write(void *data)
+{
+	struct nfs_write_data *wdata = data;
+
+	wdata->task.tk_status = 0;
+	wdata->verf.committed = NFS_FILE_SYNC;
+	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
+	schedule_work(&wdata->task.u.tk_work);
+}
+
+/* FIXME STUB - mark intersection of layout and page as bad, so is not
+ * used again.
+ */
+static void mark_bad_read(void)
+{
+	return;
+}
+
+/*
+ * map_block:  map a requested I/O block (isect) into an offset in the LVM
+ * block_device
+ *
+ * Marks @bh mapped, points it at the extent's device and converts the
+ * 512-byte device sector into units of that device's block size.
+ */
+static void
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
+{
+	dprintk("%s enter be=%p\n", __func__, be);
+
+	set_buffer_mapped(bh);
+	bh->b_bdev = be->be_mdev;
+	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
+
+	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
+		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
+		bh->b_size);
+	return;
+}
+
+/* Given an unmapped page, zero it or read in page for COW, page is locked
+ * by caller.
+ */
+/* Returns 0 on success or a negative errno if the COW read could not
+ * be set up or failed.  Note that the reference on @cow_read is dropped
+ * here on every path.
+ */
+static int
+init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+{
+	struct buffer_head *bh = NULL;
+	int ret = 0;
+	sector_t isect;
+
+	dprintk("%s enter, %p\n", __func__, page);
+	BUG_ON(PageUptodate(page));
+	if (!cow_read) {
+		zero_user_segment(page, 0, PAGE_SIZE);
+		SetPageUptodate(page);
+		goto cleanup;
+	}
+
+	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+	if (!bh) {
+		ret = -ENOMEM;
+		goto cleanup;
+	}
+
+	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
+	map_block(bh, isect, cow_read);
+	if (!bh_uptodate_or_lock(bh))
+		ret = bh_submit_read(bh);
+	if (ret)
+		goto cleanup;
+	SetPageUptodate(page);
+
+cleanup:
+	bl_put_extent(cow_read);
+	if (bh)
+		free_buffer_head(bh);
+	if (ret) {
+		/* Need to mark layout with bad read...should now
+		 * just use nfs4 for reads and writes.
+		 */
+		mark_bad_read();
+	}
+	return ret;
+}
+
+/* Write the pages described by @wdata directly to the block device(s)
+ * backing the layout.  When the write starts or ends inside an
+ * INVALID_DATA extent, the remaining pages of the server block on that
+ * side are zero-filled (or COW-read) and written as well.
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io tracker cannot
+ * be allocated.  Otherwise returns PNFS_ATTEMPTED; any failure is
+ * reported through wdata->pnfs_error, and completion is signalled via
+ * bl_end_par_io_write() once the last parallel_io reference is dropped.
+ */
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_write_data *wdata, int sync)
+{
+	int i, ret, npg_zero, pg_index, last = 0;
+	struct bio *bio = NULL;
+	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	sector_t isect, last_isect = 0, extent_length = 0;
+	struct parallel_io *par;
+	loff_t offset = wdata->args.offset;
+	size_t count = wdata->args.count;
+	struct page **pages = wdata->args.pages;
+	struct page *page;
+	pgoff_t index;
+	u64 temp;
+	int npg_per_block =
+	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+
+	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
+	 * We want to write each, and if there is an error set pnfs_error
+	 * to have it redone using nfs.
+	 */
+	par = alloc_parallel(wdata);
+	if (!par)
+		return PNFS_NOT_ATTEMPTED;
+	par->call_ops = *wdata->mds_ops;
+	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+	par->pnfs_callback = bl_end_par_io_write;
+	/* At this point, have to be more careful with error handling */
+
+	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+	if (!be || !is_writable(be, isect)) {
+		dprintk("%s no matching extents!\n", __func__);
+		wdata->pnfs_error = -EINVAL;
+		goto out;
+	}
+
+	/* First page inside INVALID extent */
+	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		/* Number of pages between the start of the server block
+		 * and the first page being written.
+		 */
+		temp = offset >> PAGE_CACHE_SHIFT;
+		npg_zero = do_div(temp, npg_per_block);
+		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
+				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+		extent_length = be->be_length - (isect - be->be_f_offset);
+
+fill_invalid_ext:
+		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
+		for (;npg_zero > 0; npg_zero--) {
+			/* page ref released in bl_end_io_write_zero */
+			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
+			dprintk("%s zero %dth page: index %lu isect %llu\n",
+				__func__, npg_zero, index,
+				(unsigned long long)isect);
+			page =
+			    find_or_create_page(wdata->inode->i_mapping, index,
+						GFP_NOFS);
+			if (!page) {
+				dprintk("%s oom\n", __func__);
+				wdata->pnfs_error = -ENOMEM;
+				goto out;
+			}
+
+			/* PageDirty: Other will write this out
+			 * PageWriteback: Other is writing this out
+			 * PageUptodate: It was read before
+			 * sector_initialized: already written out
+			 */
+			if (PageDirty(page) || PageWriteback(page) ||
+			    bl_is_sector_init(be->be_inval, isect)) {
+				print_page(page);
+				unlock_page(page);
+				page_cache_release(page);
+				goto next_page;
+			}
+			if (!PageUptodate(page)) {
+				/* New page, readin or zero it */
+				/* NOTE(review): the return value is ignored
+				 * and cow_read is put on every call - confirm
+				 * this is intended.
+				 */
+				init_page_for_write(page, cow_read);
+			}
+			set_page_writeback(page);
+			unlock_page(page);
+
+			ret = bl_mark_sectors_init(be->be_inval, isect,
+						       PAGE_CACHE_SECTORS,
+						       NULL);
+			if (unlikely(ret)) {
+				dprintk("%s bl_mark_sectors_init fail %d\n",
+					__func__, ret);
+				end_page_writeback(page);
+				page_cache_release(page);
+				wdata->pnfs_error = ret;
+				goto out;
+			}
+			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
+						 isect, page, be,
+						 bl_end_io_write_zero, par);
+			if (IS_ERR(bio)) {
+				wdata->pnfs_error = PTR_ERR(bio);
+				/* The page never made it into a bio, so
+				 * bl_end_io_write_zero will not see it:
+				 * undo writeback and drop our reference
+				 * here, as the error path above does.
+				 */
+				end_page_writeback(page);
+				page_cache_release(page);
+				/* Do not hand the ERR_PTR to the submit
+				 * at "out".
+				 */
+				bio = NULL;
+				goto out;
+			}
+			/* FIXME: This should be done in bi_end_io */
+			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+					     page->index << PAGE_CACHE_SHIFT,
+					     PAGE_CACHE_SIZE);
+next_page:
+			isect += PAGE_CACHE_SECTORS;
+			extent_length -= PAGE_CACHE_SECTORS;
+		}
+		if (last)
+			goto write_done;
+	}
+	bio = bl_submit_bio(WRITE, bio);
+
+	/* Middle pages */
+	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+	for (i = pg_index; i < wdata->npages; i++) {
+		if (!extent_length) {
+			/* We've used up the previous extent */
+			bl_put_extent(be);
+			bio = bl_submit_bio(WRITE, bio);
+			/* Get the next one */
+			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+					     isect, NULL);
+			if (!be || !is_writable(be, isect)) {
+				wdata->pnfs_error = -EINVAL;
+				goto out;
+			}
+			extent_length = be->be_length -
+			    (isect - be->be_f_offset);
+		}
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+			ret = bl_mark_sectors_init(be->be_inval, isect,
+						       PAGE_CACHE_SECTORS,
+						       NULL);
+			if (unlikely(ret)) {
+				dprintk("%s bl_mark_sectors_init fail %d\n",
+					__func__, ret);
+				wdata->pnfs_error = ret;
+				goto out;
+			}
+		}
+		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+					 isect, pages[i], be,
+					 bl_end_io_write, par);
+		if (IS_ERR(bio)) {
+			wdata->pnfs_error = PTR_ERR(bio);
+			/* Do not hand the ERR_PTR to the submit at "out" */
+			bio = NULL;
+			goto out;
+		}
+		isect += PAGE_CACHE_SECTORS;
+		last_isect = isect;
+		extent_length -= PAGE_CACHE_SECTORS;
+	}
+
+	/* Last page inside INVALID extent */
+	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		bio = bl_submit_bio(WRITE, bio);
+		/* Pages left between the end of the write and the end of
+		 * the server block; a full block means nothing to pad.
+		 */
+		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
+		npg_zero = npg_per_block - do_div(temp, npg_per_block);
+		if (npg_zero < npg_per_block) {
+			last = 1;
+			goto fill_invalid_ext;
+		}
+	}
+
+write_done:
+	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
+	if (count < wdata->res.count) {
+		wdata->res.count = count;
+	}
+out:
+	bl_put_extent(be);
+	bl_submit_bio(WRITE, bio);
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
+}
+
+/* FIXME - range ignored */
+/* Unlink every extent on both extent lists of @bl and drop the list's
+ * reference on it, under bl_ext_lock.
+ */
+static void
+release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
+{
+	int i;
+	struct pnfs_block_extent *be;
+
+	spin_lock(&bl->bl_ext_lock);
+	for (i = 0; i < EXTENT_LISTS; i++) {
+		while (!list_empty(&bl->bl_extents[i])) {
+			be = list_first_entry(&bl->bl_extents[i],
+					      struct pnfs_block_extent,
+					      be_node);
+			list_del(&be->be_node);
+			bl_put_extent(be);
+		}
+	}
+	spin_unlock(&bl->bl_ext_lock);
+}
+
+/* Free every tracking entry hanging off @marks. */
+static void
+release_inval_marks(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_inval_tracking *pos, *temp;
+
+	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
+		list_del(&pos->it_link);
+		kfree(pos);
+	}
+	return;
+}
+
+/* Layout driver hook: release the extents and invalid-sector marks of
+ * the block layout embedding @lo, then free it.
+ */
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+
+	dprintk("%s enter\n", __func__);
+	release_extents(bl, NULL);
+	release_inval_marks(&bl->bl_inval);
+	kfree(bl);
+}
+
+/* Layout driver hook: allocate and initialize a block layout and return
+ * its embedded generic header, or NULL on allocation failure.
+ */
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+						   gfp_t gfp_flags)
+{
+	struct pnfs_block_layout *bl;
+
+	dprintk("%s enter\n", __func__);
+	bl = kzalloc(sizeof(*bl), gfp_flags);
+	if (!bl)
+		return NULL;
+	spin_lock_init(&bl->bl_ext_lock);
+	INIT_LIST_HEAD(&bl->bl_extents[0]);
+	INIT_LIST_HEAD(&bl->bl_extents[1]);
+	INIT_LIST_HEAD(&bl->bl_commit);
+	INIT_LIST_HEAD(&bl->bl_committing);
+	bl->bl_count = 0;
+	/* Server block size converted from bytes to 512-byte sectors */
+	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
+	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+	return &bl->bl_layout;
+}
+
+/* Layout driver hook: free a segment allocated by bl_alloc_lseg(). */
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s enter\n", __func__);
+	kfree(lseg);
+}
+
+/* We pretty much ignore lseg, and store all data layout wide, so we
+ * can correctly merge.
+ *
+ * Returns the new segment, or an ERR_PTR carrying -ENOMEM or the status
+ * of nfs4_blk_process_layoutget().
+ */
+static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+						 struct nfs4_layoutget_res *lgr,
+						 gfp_t gfp_flags)
+{
+	struct pnfs_layout_segment *lseg;
+	int status;
+
+	dprintk("%s enter\n", __func__);
+	lseg = kzalloc(sizeof(*lseg), gfp_flags);
+	if (!lseg)
+		return ERR_PTR(-ENOMEM);
+	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+	if (status) {
+		/* We don't want to call the full-blown bl_free_lseg,
+		 * since on error extents were not touched.
+		 */
+		kfree(lseg);
+		return ERR_PTR(status);
+	}
+	return lseg;
+}
+
+/* Layout driver hook: delegate to encode_pnfs_block_layoutupdate(). */
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+		       const struct nfs4_layoutcommit_args *arg)
+{
+	dprintk("%s enter\n", __func__);
+	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+}
+
+/* Layout driver hook: delegate to clean_pnfs_block_layoutupdate() with
+ * the LAYOUTCOMMIT result status.
+ */
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
+
+	dprintk("%s enter\n", __func__);
+	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
+}
+
+/* Free every device on @mid's list and then @mid itself; a NULL @mid is
+ * a no-op.
+ */
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+	if (mid) {
+		struct pnfs_block_dev *dev;
+		spin_lock(&mid->bm_lock);
+		while (!list_empty(&mid->bm_devlist)) {
+			dev = list_first_entry(&mid->bm_devlist,
+					       struct pnfs_block_dev,
+					       bm_node);
+			list_del(&dev->bm_node);
+			bl_free_block_dev(dev);
+		}
+		spin_unlock(&mid->bm_lock);
+		kfree(mid);
+	}
+}
+
+/* This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
+ */
+/* Fetch and decode the device description for @d_id.  Returns the
+ * decoded device, or NULL on allocation, GETDEVICEINFO or decode
+ * failure.  All temporary buffers are released before returning.
+ */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+			struct nfs4_deviceid *d_id)
+{
+	struct pnfs_device *dev;
+	struct pnfs_block_dev *rv = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	struct page **pages = NULL;
+	int i, rc;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+	dprintk("%s max_resp_sz %u max_pages %d\n",
+		__func__, max_resp_sz, max_pages);
+
+	dev = kmalloc(sizeof(*dev), GFP_NOFS);
+	if (!dev) {
+		dprintk("%s kmalloc failed\n", __func__);
+		return NULL;
+	}
+
+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+	if (pages == NULL) {
+		kfree(dev);
+		return NULL;
+	}
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_NOFS);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+	dev->layout_type = LAYOUT_BLOCK_VOLUME;
+	dev->pages = pages;
+	dev->pgbase = 0;
+	dev->pglen = PAGE_SIZE * max_pages;
+	dev->mincount = 0;
+
+	/* NOTE(review): dev_id.data is printed with %s; confirm it is
+	 * NUL-terminated text rather than an opaque binary id.
+	 */
+	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+	rc = nfs4_proc_getdeviceinfo(server, dev);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free;
+
+	rv = nfs4_blk_decode_device(server, dev);
+ out_free:
+	/* pages[] was zeroed by kzalloc(), so slots that were never
+	 * filled because alloc_page() failed are still NULL and must
+	 * not be passed to __free_page().
+	 */
+	for (i = 0; i < max_pages; i++) {
+		if (pages[i])
+			__free_page(pages[i]);
+	}
+	kfree(pages);
+	kfree(dev);
+	return rv;
+}
+
+/* Layout driver hook: build the per-mount list of block devices by
+ * iterating GETDEVICELIST until eof and fetching each device's info.
+ * On success the list is stored in server->pnfs_ld_data and 0 is
+ * returned; on failure everything gathered so far is freed and a
+ * negative errno is returned.
+ */
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+	struct block_mount_id *b_mt_id = NULL;
+	struct pnfs_devicelist *dlist = NULL;
+	struct pnfs_block_dev *bdev;
+	LIST_HEAD(block_disklist);
+	int status = 0, i;
+
+	dprintk("%s enter\n", __func__);
+
+	if (server->pnfs_blksize == 0) {
+		dprintk("%s Server did not return blksize\n", __func__);
+		return -EINVAL;
+	}
+	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
+	if (!b_mt_id) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	/* Initialize nfs4 block layout mount id */
+	spin_lock_init(&b_mt_id->bm_lock);
+	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
+
+	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
+	if (!dlist) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	dlist->eof = 0;
+	while (!dlist->eof) {
+		status = nfs4_proc_getdevicelist(server, fh, dlist);
+		if (status)
+			goto out_error;
+		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+			__func__, dlist->num_devs, dlist->eof);
+		for (i = 0; i < dlist->num_devs; i++) {
+			bdev = nfs4_blk_get_deviceinfo(server, fh,
+						       &dlist->dev_id[i]);
+			if (!bdev) {
+				status = -ENODEV;
+				goto out_error;
+			}
+			spin_lock(&b_mt_id->bm_lock);
+			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+			spin_unlock(&b_mt_id->bm_lock);
+		}
+	}
+	dprintk("%s SUCCESS\n", __func__);
+	server->pnfs_ld_data = b_mt_id;
+
+ out_return:
+	kfree(dlist);
+	return status;
+
+ out_error:
+	free_blk_mountid(b_mt_id);
+	goto out_return;
+}
+
+/* Layout driver hook: free the device list built by
+ * bl_set_layoutdriver().  Always returns 0.
+ */
+static int
+bl_clear_layoutdriver(struct nfs_server *server)
+{
+	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+
+	dprintk("%s enter\n", __func__);
+	free_blk_mountid(b_mt_id);
+	dprintk("%s RETURNS\n", __func__);
+	return 0;
+}
+
+/* Page I/O descriptors: the generic pNFS helpers are used unchanged */
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+	.pg_init = pnfs_generic_pg_init_read,
+	.pg_test = pnfs_generic_pg_test,
+	.pg_doio = pnfs_generic_pg_readpages,
+};
+
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+	.pg_init = pnfs_generic_pg_init_write,
+	.pg_test = pnfs_generic_pg_test,
+	.pg_doio = pnfs_generic_pg_writepages,
+};
+
+/* Layout driver registration record for LAYOUT_BLOCK_VOLUME */
+static struct pnfs_layoutdriver_type blocklayout_type = {
+	.id				= LAYOUT_BLOCK_VOLUME,
+	.name				= "LAYOUT_BLOCK_VOLUME",
+	.read_pagelist			= bl_read_pagelist,
+	.write_pagelist			= bl_write_pagelist,
+	.alloc_layout_hdr		= bl_alloc_layout_hdr,
+	.free_layout_hdr		= bl_free_layout_hdr,
+	.alloc_lseg			= bl_alloc_lseg,
+	.free_lseg			= bl_free_lseg,
+	.encode_layoutcommit		= bl_encode_layoutcommit,
+	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
+	.set_layoutdriver		= bl_set_layoutdriver,
+	.clear_layoutdriver		= bl_clear_layoutdriver,
+	.pg_read_ops			= &bl_pg_read_ops,
+	.pg_write_ops			= &bl_pg_write_ops,
+};
+
+/* Operations for the "blocklayout" rpc_pipefs pipe */
+static const struct rpc_pipe_ops bl_upcall_ops = {
+	.upcall		= bl_pipe_upcall,
+	.downcall	= bl_pipe_downcall,
+	.destroy_msg	= bl_pipe_destroy_msg,
+};
+
+/* Module init: register the layout driver and create the "blocklayout"
+ * pipe under NFS_PIPE_DIRNAME in rpc_pipefs.  On success the
+ * rpc_pipefs mount reference is kept until module exit; on failure
+ * every resource acquired so far is released.
+ */
+static int __init nfs4blocklayout_init(void)
+{
+	struct vfsmount *mnt;
+	struct path path;
+	int ret;
+
+	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+	ret = pnfs_register_layoutdriver(&blocklayout_type);
+	if (ret)
+		goto out;
+
+	init_waitqueue_head(&bl_wq);
+
+	mnt = rpc_get_mount();
+	if (IS_ERR(mnt)) {
+		ret = PTR_ERR(mnt);
+		goto out_remove;
+	}
+
+	ret = vfs_path_lookup(mnt->mnt_root,
+			      mnt,
+			      NFS_PIPE_DIRNAME, 0, &path);
+	if (ret)
+		goto out_putrpc;
+
+	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
+				    &bl_upcall_ops, 0);
+	/* The looked-up path is only needed to create the pipe */
+	path_put(&path);
+	if (IS_ERR(bl_device_pipe)) {
+		ret = PTR_ERR(bl_device_pipe);
+		goto out_putrpc;
+	}
+out:
+	return ret;
+
+out_putrpc:
+	/* Balance rpc_get_mount() */
+	rpc_put_mount();
+out_remove:
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+	return ret;
+}
+
+/* Module exit: undo nfs4blocklayout_init(). */
+static void __exit nfs4blocklayout_exit(void)
+{
+	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+	       __func__);
+
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+	rpc_unlink(bl_device_pipe);
+	/* Drop the rpc_pipefs mount reference taken at init */
+	rpc_put_mount();
+}
+
+MODULE_ALIAS("nfs-layouttype4-3");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 00000000000..f27d827960a
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,207 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include <linux/device-mapper.h> +#include <linux/nfs_fs.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "../pnfs.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) + +struct block_mount_id { +	spinlock_t			bm_lock;    /* protects list */ +	struct list_head		bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { +	struct list_head		bm_node; +	struct nfs4_deviceid		bm_mdevid;    /* associated devid */ +	struct block_device		*bm_mdev;     /* meta device itself */ +}; + +enum exstate4 { +	PNFS_BLOCK_READWRITE_DATA	= 0, +	PNFS_BLOCK_READ_DATA		= 1, +	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */ +	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { +	sector_t		mtt_step_size;	/* Internal sector alignment */ +	struct list_head	mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { +	spinlock_t	im_lock; +	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */ +	sector_t	im_block_size;	/* Server blocksize in sectors */ +}; + +struct pnfs_inval_tracking { +	struct list_head it_link; +	int		 it_sector; /* NOTE(review): extents.c stores u64 sector values here - confirm int is wide enough */ +	int		 it_tags; +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { +	struct kref	be_refcnt; +	struct list_head be_node;	/* link into lseg list */ +	struct nfs4_deviceid be_devid;  /* FIXME: could use device cache instead */ +	struct block_device *be_mdev; +	sector_t	be_f_offset;	/* the starting offset in the file */ +	sector_t	be_length;	/* the size of the extent */ +	sector_t	be_v_offset;	/* the starting offset in the volume */ +	enum exstate4	be_state;	/* the state of this extent */ +	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +}; + +/* Shortened extent used by LAYOUTCOMMIT */ +struct 
pnfs_block_short_extent { +	struct list_head bse_node; +	struct nfs4_deviceid bse_devid; +	struct block_device *bse_mdev; +	sector_t	bse_f_offset;	/* the starting offset in the file */ +	sector_t	bse_length;	/* the size of the extent */ +}; + +static inline void +BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) +{ +	spin_lock_init(&marks->im_lock); +	INIT_LIST_HEAD(&marks->im_tree.mtt_stub); +	marks->im_block_size = blocksize; +	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, +					   blocksize); +} + +enum extentclass4 { +	RW_EXTENT       = 0, /* READWRITE and INVAL */ +	RO_EXTENT       = 1, /* READ and NONE */ +	EXTENT_LISTS    = 2, +}; + +static inline int bl_choose_list(enum exstate4 state) +{ +	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) +		return RO_EXTENT; +	else +		return RW_EXTENT; +} + +struct pnfs_block_layout { +	struct pnfs_layout_hdr bl_layout; +	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ +	spinlock_t		bl_ext_lock;   /* Protects list manipulation */ +	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */ +	struct list_head	bl_commit;	/* Needs layout commit */ +	struct list_head	bl_committing;	/* Layout committing */ +	unsigned int		bl_count;	/* entries in bl_commit */ +	sector_t		bl_blocksize;  /* Server blocksize in sectors */ +}; + +#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ +	return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ +	return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_dev_msg { +	int status; +	uint32_t major, minor; +}; + +struct bl_msg_hdr { +	u8  type; +	u16 totallen; /* length of the payload that follows this header (both senders in this driver exclude the header itself) */ +}; + +extern struct dentry *bl_device_pipe; +extern wait_queue_head_t 
bl_wq; + +#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices */ +#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */ + +/* blocklayoutdev.c */ +ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, +		       char __user *, size_t); +ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +void bl_pipe_destroy_msg(struct rpc_pipe_msg *); +struct block_device *nfs4_blkdev_get(dev_t dev); +int nfs4_blkdev_put(struct block_device *bdev); +struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, +						struct pnfs_device *dev); +int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, +				struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + +/* blocklayoutdm.c */ +void bl_free_block_dev(struct pnfs_block_dev *bdev); + +/* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, +		struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, +			     sector_t offset, sector_t length, +			     sector_t **pages); +void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +				   struct xdr_stream *xdr, +				   const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +				   const struct nfs4_layoutcommit_args *arg, +				   int status); +int bl_add_merge_extent(struct pnfs_block_layout *bl, +			 struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, +			sector_t offset, sector_t length); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git 
a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 00000000000..a83b393fb01 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -0,0 +1,410 @@ +/* + *  linux/fs/nfs/blocklayout/blocklayoutdev.c + * + *  Device operations for the pnfs nfs4 block layout driver. + * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ +#include <linux/module.h> +#include <linux/buffer_head.h> /* __bread */ + +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ +	uint64_t s; /* byte value decoded from the XDR stream at *rp */ + +	*rp = xdr_decode_hyper(*rp, &s); +	if (s & 0x1ff) { /* low 9 bits set: not a multiple of 512 bytes */ +		printk(KERN_WARNING "%s: sector not aligned\n", __func__); +		return -1; +	} +	*sp = s >> SECTOR_SHIFT; /* store as a sector count, not bytes */ +	return 0; +} + +/* Open a block_device by device number (FMODE_READ). Returns NULL, never an ERR_PTR, on failure. */ +struct block_device *nfs4_blkdev_get(dev_t dev) +{ +	struct block_device *bd; + +	dprintk("%s enter\n", __func__); +	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); +	if (IS_ERR(bd)) +		goto fail; +	return bd; +fail: +	dprintk("%s failed to open device : %ld\n", +			__func__, PTR_ERR(bd)); +	return NULL; +} + +/* + * Release the block device; returns whatever blkdev_put() returns + */ +int nfs4_blkdev_put(struct block_device *bdev) +{ +	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), +			MINOR(bdev->bd_dev)); +	return blkdev_put(bdev, FMODE_READ); +} + +/* + * Shouldn't there be a rpc_generic_upcall() to do this for us? 
+ */ +ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, +		       char __user *dst, size_t buflen) +{ +	char *data = (char *)msg->data + msg->copied; +	size_t mlen = min(msg->len - msg->copied, buflen); +	unsigned long left; + +	left = copy_to_user(dst, data, mlen); +	if (left == mlen) { +		msg->errno = -EFAULT; +		return -EFAULT; +	} + +	mlen -= left; +	msg->copied += mlen; +	msg->errno = 0; +	return mlen; +} + +static struct bl_dev_msg bl_mount_reply; + +ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, +			 size_t mlen) +{ +	if (mlen != sizeof (struct bl_dev_msg)) +		return -EINVAL; + +	if (copy_from_user(&bl_mount_reply, src, mlen) != 0) +		return -EFAULT; + +	wake_up(&bl_wq); + +	return mlen; +} + +void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ +	if (msg->errno >= 0) +		return; +	wake_up(&bl_wq); +} + +/* + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. + */ +struct pnfs_block_dev * +nfs4_blk_decode_device(struct nfs_server *server, +		       struct pnfs_device *dev) +{ +	struct pnfs_block_dev *rv = NULL; +	struct block_device *bd = NULL; +	struct rpc_pipe_msg msg; +	struct bl_msg_hdr bl_msg = { +		.type = BL_DEVICE_MOUNT, +		.totallen = dev->mincount, +	}; +	uint8_t *dataptr; +	DECLARE_WAITQUEUE(wq, current); +	struct bl_dev_msg *reply = &bl_mount_reply; +	int offset, len, i; + +	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); +	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, +		dev->mincount); + +	memset(&msg, 0, sizeof(msg)); +	msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); +	if (!msg.data) { +		rv = ERR_PTR(-ENOMEM); +		goto out; +	} + +	memcpy(msg.data, &bl_msg, sizeof(bl_msg)); +	dataptr = (uint8_t *) msg.data; +	len = dev->mincount; +	offset = sizeof(bl_msg); +	for (i = 0; len > 0; i++) { +		memcpy(&dataptr[offset], page_address(dev->pages[i]), +				len < PAGE_CACHE_SIZE ? 
len : PAGE_CACHE_SIZE); +		len -= PAGE_CACHE_SIZE; +		offset += PAGE_CACHE_SIZE; +	} +	msg.len = sizeof(bl_msg) + dev->mincount; + +	dprintk("%s CALLING USERSPACE DAEMON\n", __func__); +	add_wait_queue(&bl_wq, &wq); +	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { +		remove_wait_queue(&bl_wq, &wq); +		goto out; +	} + +	set_current_state(TASK_UNINTERRUPTIBLE); +	schedule(); +	__set_current_state(TASK_RUNNING); +	remove_wait_queue(&bl_wq, &wq); + +	if (reply->status != BL_DEVICE_REQUEST_PROC) { +		dprintk("%s failed to open device: %d\n", +			__func__, reply->status); +		rv = ERR_PTR(-EINVAL); +		goto out; +	} + +	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); +	if (IS_ERR(bd)) { +		dprintk("%s failed to open device : %ld\n", +			__func__, PTR_ERR(bd)); +		goto out; +	} + +	rv = kzalloc(sizeof(*rv), GFP_NOFS); +	if (!rv) { +		rv = ERR_PTR(-ENOMEM); +		goto out; +	} + +	rv->bm_mdev = bd; +	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); +	dprintk("%s Created device %s with bd_block_size %u\n", +		__func__, +		bd->bd_disk->disk_name, +		bd->bd_block_size); + +out: +	kfree(msg.data); +	return rv; +} + +/* Map deviceid returned by the server to constructed block_device */ +static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, +					    struct nfs4_deviceid *id) +{ +	struct block_device *rv = NULL; +	struct block_mount_id *mid; +	struct pnfs_block_dev *dev; + +	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); +	mid = BLK_ID(lo); +	spin_lock(&mid->bm_lock); +	list_for_each_entry(dev, &mid->bm_devlist, bm_node) { +		if (memcmp(id->data, dev->bm_mdevid.data, +			   NFS4_DEVICEID4_SIZE) == 0) { +			rv = dev->bm_mdev; +			goto out; +		} +	} + out: +	spin_unlock(&mid->bm_lock); +	dprintk("%s returning %p\n", __func__, rv); +	return rv; +} + +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { +	u32 mode;	/* R or RW */ +	u64 start;	/* Expected start of next 
non-COW extent */ +	u64 inval;	/* Start of INVAL coverage */ +	u64 cowread;	/* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, +			 struct layout_verification *lv) +{ +	if (lv->mode == IOMODE_READ) { +		if (be->be_state == PNFS_BLOCK_READWRITE_DATA || +		    be->be_state == PNFS_BLOCK_INVALID_DATA) +			return -EIO; +		if (be->be_f_offset != lv->start) +			return -EIO; +		lv->start += be->be_length; +		return 0; +	} +	/* lv->mode == IOMODE_RW */ +	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { +		if (be->be_f_offset != lv->start) +			return -EIO; +		if (lv->cowread > lv->start) +			return -EIO; +		lv->start += be->be_length; +		lv->inval = lv->start; +		return 0; +	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { +		if (be->be_f_offset != lv->start) +			return -EIO; +		lv->start += be->be_length; +		return 0; +	} else if (be->be_state == PNFS_BLOCK_READ_DATA) { +		if (be->be_f_offset > lv->start) +			return -EIO; +		if (be->be_f_offset < lv->inval) +			return -EIO; +		if (be->be_f_offset < lv->cowread) +			return -EIO; +		/* It looks like you might want to min this with lv->start, +		 * but you really don't. 
+		 */ +		lv->inval = lv->inval + be->be_length; +		lv->cowread = be->be_f_offset + be->be_length; +		return 0; +	} else +		return -EIO; +} + +/* XDR decode pnfs_block_layout4 structure */ +int +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, +			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) +{ +	struct pnfs_block_layout *bl = BLK_LO2EXT(lo); +	int i, status = -EIO; +	uint32_t count; +	struct pnfs_block_extent *be = NULL, *save; +	struct xdr_stream stream; +	struct xdr_buf buf; +	struct page *scratch; +	__be32 *p; +	struct layout_verification lv = { +		.mode = lgr->range.iomode, +		.start = lgr->range.offset >> SECTOR_SHIFT, +		.inval = lgr->range.offset >> SECTOR_SHIFT, +		.cowread = lgr->range.offset >> SECTOR_SHIFT, +	}; +	LIST_HEAD(extents); + +	dprintk("---> %s\n", __func__); + +	scratch = alloc_page(gfp_flags); +	if (!scratch) +		return -ENOMEM; + +	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); +	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + +	p = xdr_inline_decode(&stream, 4); +	if (unlikely(!p)) +		goto out_err; + +	count = be32_to_cpup(p++); + +	dprintk("%s enter, number of extents %i\n", __func__, count); +	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); +	if (unlikely(!p)) +		goto out_err; + +	/* Decode individual extents, putting them in temporary +	 * staging area until whole layout is decoded to make error +	 * recovery easier. 
+	 */ +	for (i = 0; i < count; i++) { +		be = bl_alloc_extent(); +		if (!be) { +			status = -ENOMEM; +			goto out_err; +		} +		memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); +		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); +		be->be_mdev = translate_devid(lo, &be->be_devid); +		if (!be->be_mdev) +			goto out_err; + +		/* The next three values are read in as bytes, +		 * but stored as 512-byte sector lengths +		 */ +		if (decode_sector_number(&p, &be->be_f_offset) < 0) +			goto out_err; +		if (decode_sector_number(&p, &be->be_length) < 0) +			goto out_err; +		if (decode_sector_number(&p, &be->be_v_offset) < 0) +			goto out_err; +		be->be_state = be32_to_cpup(p++); +		if (be->be_state == PNFS_BLOCK_INVALID_DATA) +			be->be_inval = &bl->bl_inval; +		if (verify_extent(be, &lv)) { +			dprintk("%s verify failed\n", __func__); +			goto out_err; +		} +		list_add_tail(&be->be_node, &extents); +	} +	if (lgr->range.offset + lgr->range.length != +			lv.start << SECTOR_SHIFT) { +		dprintk("%s Final length mismatch\n", __func__); +		be = NULL; +		goto out_err; +	} +	if (lv.start < lv.cowread) { +		dprintk("%s Final uncovered COW extent\n", __func__); +		be = NULL; +		goto out_err; +	} +	/* Extents decoded properly, now try to merge them in to +	 * existing layout extents. +	 */ +	spin_lock(&bl->bl_ext_lock); +	list_for_each_entry_safe(be, save, &extents, be_node) { +		list_del(&be->be_node); +		status = bl_add_merge_extent(bl, be); +		if (status) { +			spin_unlock(&bl->bl_ext_lock); +			/* This is a fairly catastrophic error, as the +			 * entire layout extent lists are now corrupted. +			 * We should have some way to distinguish this. 
+			 */ +			be = NULL; +			goto out_err; +		} +	} +	spin_unlock(&bl->bl_ext_lock); +	status = 0; + out: +	__free_page(scratch); +	dprintk("%s returns %i\n", __func__, status); +	return status; + + out_err: +	bl_put_extent(be); +	while (!list_empty(&extents)) { +		be = list_first_entry(&extents, struct pnfs_block_extent, +				      be_node); +		list_del(&be->be_node); +		bl_put_extent(be); +	} +	goto out; +} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 00000000000..d055c755807 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -0,0 +1,111 @@ +/* + *  linux/fs/nfs/blocklayout/blocklayoutdm.c + * + *  Module for the NFSv4.1 pNFS block layout driver. + * + *  Copyright (c) 2007 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Fred Isaman <iisaman@umich.edu> + *  Andy Adamson <andros@citi.umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/genhd.h> /* gendisk - used in a dprintk*/ +#include <linux/sched.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +/* + * Send a BL_DEVICE_UMOUNT request for dev to the userspace daemon through the + * blocklayout pipe and sleep on bl_wq until woken.  Failures (allocation, + * queueing the upcall) are not reported to the caller. + */ +static void dev_remove(dev_t dev) +{ +	struct rpc_pipe_msg msg; +	struct bl_dev_msg bl_umount_request; +	struct bl_msg_hdr bl_msg = { +		.type = BL_DEVICE_UMOUNT, +		.totallen = sizeof(bl_umount_request), +	}; +	uint8_t *dataptr; +	DECLARE_WAITQUEUE(wq, current); + +	dprintk("Entering %s\n", __func__); + +	memset(&msg, 0, sizeof(msg)); +	/* Room for the header plus the umount request, both copied in below */ +	msg.data = kzalloc(sizeof(bl_msg) + bl_msg.totallen, GFP_NOFS); +	if (!msg.data) +		goto out; + +	memset(&bl_umount_request, 0, sizeof(bl_umount_request)); +	bl_umount_request.major = MAJOR(dev); +	bl_umount_request.minor = MINOR(dev); + +	memcpy(msg.data, &bl_msg, sizeof(bl_msg)); +	dataptr = (uint8_t *) msg.data; +	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); +	msg.len = sizeof(bl_msg) + bl_msg.totallen; + +	add_wait_queue(&bl_wq, &wq); +	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { +		remove_wait_queue(&bl_wq, &wq); +		goto out; +	} + +	set_current_state(TASK_UNINTERRUPTIBLE); +	schedule(); +	__set_current_state(TASK_RUNNING); +	remove_wait_queue(&bl_wq, &wq); + +out: +	kfree(msg.data); +} + +/* + * Release meta device + */ +static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) +{ +	/* Read the device number before the reference is dropped */ +	dev_t dev = bdev->bm_mdev->bd_dev; +	int rv; + +	dprintk("%s Releasing\n", __func__); +	rv = nfs4_blkdev_put(bdev->bm_mdev); +	if (rv) +		printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", +				__func__, rv); + +	dev_remove(dev); +} + +/* Release the meta device of bdev (if any) and free bdev; NULL is a no-op */ +void bl_free_block_dev(struct pnfs_block_dev *bdev) +{ +	
if (bdev) { +		if (bdev->bm_mdev) { +			dprintk("%s Removing DM device: %d:%d\n", +				__func__, +				MAJOR(bdev->bm_mdev->bd_dev), +				MINOR(bdev->bm_mdev->bd_dev)); +			nfs4_blk_metadev_release(bdev); +		} +		kfree(bdev); +	} +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 00000000000..19fa7b0b8c0 --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,935 @@ +/* + *  linux/fs/nfs/blocklayout/extents.c + * + *  Module for the NFSv4.1 pNFS block layout driver. + * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ + +#include "blocklayout.h" +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +/* Bit numbers */ +#define EXTENT_INITIALIZED 0 +#define EXTENT_WRITTEN     1 +#define EXTENT_IN_COMMIT   2 +#define INTERNAL_EXISTS    MY_MAX_TAGS +#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1) + +/* Returns largest t<=s s.t. t%base==0 */ +static inline sector_t normalize(sector_t s, int base) +{ +	sector_t tmp = s; /* Since do_div modifies its argument */ +	return s - do_div(tmp, base); +} + +static inline sector_t normalize_up(sector_t s, int base) +{ +	return normalize(s + base - 1, base); /* rounds s up to a multiple of base */ +} + +/* Complete stub: a plain list is used until the wanted API is determined */ + +/* Returns the tag bits (masked with INTERNAL_MASK) of the entry at sector s, or -ENOENT if there is none */ +static int32_t _find_entry(struct my_tree *tree, u64 s) +{ +	struct pnfs_inval_tracking *pos; + +	dprintk("%s(%llu) enter\n", __func__, s); +	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { +		if (pos->it_sector > s) +			continue; +		else if (pos->it_sector == s) +			return pos->it_tags & INTERNAL_MASK; +		else +			break; +	} +	return -ENOENT; +} + +static inline +int _has_tag(struct my_tree *tree, u64 s, int32_t tag) +{ +	int32_t tags; + +	dprintk("%s(%llu, %i) enter\n", __func__, s, tag); +	s = normalize(s, tree->mtt_step_size); /* entries are keyed by step-aligned sector */ +	tags = _find_entry(tree, s); +	if ((tags < 0) || !(tags & (1 << tag))) +		return 0; +	else +		return 1; +} + +/* Creates entry with tag, or if entry already exists, unions tag to it. + * If storage is not NULL, newly created entry will use it. + * Returns number of entries added, or negative on error. 
+ */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, +		      struct pnfs_inval_tracking *storage) +{ +	int found = 0; +	struct pnfs_inval_tracking *pos; + +	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); +	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { +		if (pos->it_sector > s) +			continue; +		else if (pos->it_sector == s) { +			found = 1; +			break; +		} else +			break; +	} +	if (found) { +		pos->it_tags |= (1 << tag); +		return 0; +	} else { +		struct pnfs_inval_tracking *new; +		if (storage) +			new = storage; +		else { +			new = kmalloc(sizeof(*new), GFP_NOFS); +			if (!new) +				return -ENOMEM; +		} +		new->it_sector = s; +		new->it_tags = (1 << tag); +		list_add(&new->it_link, &pos->it_link); /* insert right after pos */ +		return 1; +	} +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag. Returns -ENOMEM whenever _add_entry() returns non-zero, i.e. also when an entry had to be created. */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ +	u64 i; + +	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); +	for (i = normalize(s, tree->mtt_step_size); i < s + length; +	     i += tree->mtt_step_size) +		if (_add_entry(tree, i, tag, NULL)) +			return -ENOMEM; +	return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +{ +	u64 start, end, s; +	int count, i, used = 0, status = -ENOMEM; +	struct pnfs_inval_tracking **storage; + +	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); +	start = normalize(offset, tree->mtt_step_size); +	end = normalize_up(offset + length, tree->mtt_step_size); +	count = (int)(end - start) / (int)tree->mtt_step_size; + +	/* Pre-malloc what memory we might need */ +	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); +	if (!storage) +		return -ENOMEM; +	for (i = 0; i < count; i++) { +		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), +				     GFP_NOFS); +		if (!storage[i]) +		
	goto out_cleanup; +	} + +	/* Now need lock - HOW??? */ + +	for (s = start; s < end; s += tree->mtt_step_size) +		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + +	/* Unlock - HOW??? */ +	status = 0; + + out_cleanup: +	for (i = used; i < count; i++) { /* free the preallocated entries that were not consumed */ +		if (!storage[i]) +			break; +		kfree(storage[i]); +	} +	kfree(storage); +	return status; +} + +static void set_needs_init(sector_t *array, sector_t offset) +{ +	sector_t *p = array; /* sorted array terminated by ~0 */ + +	dprintk("%s enter\n", __func__); +	if (!p) +		return; +	while (*p < offset) +		p++; +	if (*p == offset) +		return; +	else if (*p == ~0) { +		*p++ = offset; +		*p = ~0; +		return; +	} else { +		sector_t *save = p; +		dprintk("%s Adding %llu\n", __func__, (u64)offset); +		while (*p != ~0) +			p++; +		p++; +		memmove(save + 1, save, (char *)p - (char *)save); /* shift the tail, terminator included, up one slot */ +		*save = offset; +		return; +	} +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ +	int rv; + +	spin_lock(&marks->im_lock); +	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); +	spin_unlock(&marks->im_lock); +	return rv; +} + +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ +	struct pnfs_inval_tracking *pos; +	u64 expect = 0; + +	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); +	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { +		if (pos->it_sector >= end) +			continue; +		if (!expect) { +			if ((pos->it_sector == end - tree->mtt_step_size) && +			    (pos->it_tags & (1 << tag))) { +				expect = pos->it_sector - tree->mtt_step_size; +				if (pos->it_sector < tree->mtt_step_size || expect < start) +					return 1; +				continue; +			} else { +				return 0; +			} +		} +		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) +			return 0; +		expect -= tree->mtt_step_size; +		if (expect < start) +			return 1; +	} +	return 0; +} + +static int 
is_range_written(struct pnfs_inval_markings *marks, +			    sector_t start, sector_t end) +{ +	int rv; + +	spin_lock(&marks->im_lock); +	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); +	spin_unlock(&marks->im_lock); +	return rv; +} + +/* Marks sectors in [offset, offset+length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). + * Notes where partial block is initialized, and helps prepare it for + * complete initialization later. + */ +/* Currently assumes offset is page-aligned */ +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, +			     sector_t offset, sector_t length, +			     sector_t **pages) +{ +	sector_t s, start, end; +	sector_t *array = NULL; /* Pages to mark */ + +	dprintk("%s(offset=%llu,len=%llu) enter\n", +		__func__, (u64)offset, (u64)length); +	s = max((sector_t) 3, +		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); +	dprintk("%s set max=%llu\n", __func__, (u64)s); +	if (pages) { +		array = kmalloc(s * sizeof(sector_t), GFP_NOFS); +		if (!array) +			goto outerr; +		array[0] = ~0; /* empty list: terminator only */ +	} + +	start = normalize(offset, marks->im_block_size); +	end = normalize_up(offset + length, marks->im_block_size); +	if (_preload_range(&marks->im_tree, start, end - start)) +		goto outerr; + +	spin_lock(&marks->im_lock); + +	for (s = normalize_up(start, PAGE_CACHE_SECTORS); +	     s < offset; s += PAGE_CACHE_SECTORS) { +		dprintk("%s pre-area pages\n", __func__); +		/* Portion of used block is not initialized */ +		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) +			set_needs_init(array, s); +	} +	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) +		goto out_unlock; +	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); +	     s < end; s += PAGE_CACHE_SECTORS) { +		dprintk("%s post-area pages\n", __func__); +		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) +			set_needs_init(array, s); +	} + +	spin_unlock(&marks->im_lock); + +	if (pages) { +		
if (array[0] == ~0) { /* nothing recorded: hand back NULL instead of an empty array */ +			kfree(array); +			*pages = NULL; +		} else +			*pages = array; +	} +	return 0; + + out_unlock: +	spin_unlock(&marks->im_lock); + outerr: +	if (pages) { +		kfree(array); +		*pages = NULL; +	} +	return -ENOMEM; +} + +/* Marks sectors in [offset, offset+length) as having been written to disk. + * All lengths should be block aligned. + */ +static int mark_written_sectors(struct pnfs_inval_markings *marks, +				sector_t offset, sector_t length) +{ +	int status; + +	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, +		(u64)offset, (u64)length); +	spin_lock(&marks->im_lock); +	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); +	spin_unlock(&marks->im_lock); +	return status; +} + +static void print_short_extent(struct pnfs_block_short_extent *be) +{ +	dprintk("PRINT SHORT EXTENT extent %p\n", be); +	if (be) { +		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset); +		dprintk("        be_length   %llu\n", (u64)be->bse_length); +	} +} + +static void print_clist(struct list_head *list, unsigned int count) +{ +	struct pnfs_block_short_extent *be; +	unsigned int i = 0; + +	ifdebug(FACILITY) { +		printk(KERN_DEBUG "****************\n"); +		printk(KERN_DEBUG "Extent list looks like:\n"); +		list_for_each_entry(be, list, bse_node) { +			i++; +			print_short_extent(be); +		} +		if (i != count) /* list length disagrees with the caller's count */ +			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); +		printk(KERN_DEBUG "****************\n"); +	} +} + +/* Note: In theory, we should do more checking that devid's match between + * old and new, but if they don't, the lists are too corrupt to salvage anyway. 
+ */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, +			      struct pnfs_block_short_extent *new) +{ +	struct list_head *clist = &bl->bl_commit; +	struct pnfs_block_short_extent *old, *save; +	sector_t end = new->bse_f_offset + new->bse_length; + +	dprintk("%s enter\n", __func__); +	print_short_extent(new); +	print_clist(clist, bl->bl_count); +	bl->bl_count++; +	/* Scan for proper place to insert, extending new to the left +	 * as much as possible. +	 */ +	list_for_each_entry_safe(old, save, clist, bse_node) { +		if (new->bse_f_offset < old->bse_f_offset) +			break; +		if (end <= old->bse_f_offset + old->bse_length) { +			/* Range is already in list */ +			bl->bl_count--; +			kfree(new); +			return; +		} else if (new->bse_f_offset <= +				old->bse_f_offset + old->bse_length) { +			/* new overlaps or abuts existing be */ +			if (new->bse_mdev == old->bse_mdev) { +				/* extend new to fully replace old */ +				new->bse_length += new->bse_f_offset - +						old->bse_f_offset; +				new->bse_f_offset = old->bse_f_offset; +				list_del(&old->bse_node); +				bl->bl_count--; +				kfree(old); +			} +		} +	} +	/* Note that if we never hit the above break, old will not point to a +	 * valid extent.  However, in that case &old->bse_node==list. +	 */ +	list_add_tail(&new->bse_node, &old->bse_node); +	/* Scan forward for overlaps.  If we find any, extend new and +	 * remove the overlapped extent. 
+	 */ +	old = list_prepare_entry(new, clist, bse_node); +	list_for_each_entry_safe_continue(old, save, clist, bse_node) { +		if (end < old->bse_f_offset) +			break; +		/* new overlaps or abuts old */ +		if (new->bse_mdev == old->bse_mdev) { +			if (end < old->bse_f_offset + old->bse_length) { +				/* extend new to fully cover old */ +				end = old->bse_f_offset + old->bse_length; +				new->bse_length = end - new->bse_f_offset; +			} +			list_del(&old->bse_node); +			bl->bl_count--; +			kfree(old); +		} +	} +	dprintk("%s: after merging\n", __func__); +	print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, +		    sector_t offset, sector_t length) +{ +	sector_t new_end, end = offset + length; +	struct pnfs_block_short_extent *new; +	struct pnfs_block_layout *bl = container_of(be->be_inval, +						    struct pnfs_block_layout, +						    bl_inval); + +	new = kmalloc(sizeof(*new), GFP_NOFS); +	if (!new) +		return -ENOMEM; + +	mark_written_sectors(be->be_inval, offset, length); +	/* We want to add the range to commit list, but it must be +	 * block-normalized, and verified that the normalized range has +	 * been entirely written to disk. 
+	 */ +	new->bse_f_offset = offset; +	offset = normalize(offset, bl->bl_blocksize); +	if (offset < new->bse_f_offset) { +		if (is_range_written(be->be_inval, offset, new->bse_f_offset)) +			new->bse_f_offset = offset; +		else +			new->bse_f_offset = offset + bl->bl_blocksize; +	} +	new_end = normalize_up(end, bl->bl_blocksize); +	if (end < new_end) { +		if (is_range_written(be->be_inval, end, new_end)) +			end = new_end; +		else +			end = new_end - bl->bl_blocksize; +	} +	if (end <= new->bse_f_offset) { +		kfree(new); +		return 0; +	} +	new->bse_length = end - new->bse_f_offset; +	new->bse_devid = be->be_devid; +	new->bse_mdev = be->be_mdev; + +	spin_lock(&bl->bl_ext_lock); +	/* new will be freed, either by add_to_commitlist if it decides not +	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist. +	 */ +	add_to_commitlist(bl, new); +	spin_unlock(&bl->bl_ext_lock); +	return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ +	dprintk("PRINT EXTENT extent %p\n", be); +	if (be) { +		dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset); +		dprintk("        be_length   %llu\n", (u64)be->be_length); +		dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset); +		dprintk("        be_state    %d\n", be->be_state); +	} +} + +static void +destroy_extent(struct kref *kref) +{ +	struct pnfs_block_extent *be; + +	be = container_of(kref, struct pnfs_block_extent, be_refcnt); +	dprintk("%s be=%p\n", __func__, be); +	kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ +	if (be) { +		dprintk("%s enter %p (%i)\n", __func__, be, +			atomic_read(&be->be_refcnt.refcount)); +		kref_put(&be->be_refcnt, destroy_extent); +	} +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ +	struct pnfs_block_extent *be; + +	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); +	if (!be) +		return NULL; +	INIT_LIST_HEAD(&be->be_node); +	kref_init(&be->be_refcnt); +	be->be_inval = NULL; +	return be; +} + +static void print_elist(struct 
list_head *list) +{ +	struct pnfs_block_extent *be; +	dprintk("****************\n"); +	dprintk("Extent list looks like:\n"); +	list_for_each_entry(be, list, be_node) { +		print_bl_extent(be); +	} +	dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ +	/* Note this assumes new->be_f_offset >= old->be_f_offset */ +	return (new->be_state == old->be_state) && +		((new->be_state == PNFS_BLOCK_NONE_DATA) || +		 ((new->be_v_offset - old->be_v_offset == +		   new->be_f_offset - old->be_f_offset) && +		  new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set.  If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, +		     struct pnfs_block_extent *new) +{ +	struct pnfs_block_extent *be, *tmp; +	sector_t end = new->be_f_offset + new->be_length; +	struct list_head *list; + +	dprintk("%s enter with be=%p\n", __func__, new); +	print_bl_extent(new); +	list = &bl->bl_extents[bl_choose_list(new->be_state)]; +	print_elist(list); + +	/* Scan for proper place to insert, extending new to the left +	 * as much as possible. 
+	 */ +	list_for_each_entry_safe_reverse(be, tmp, list, be_node) { +		if (new->be_f_offset >= be->be_f_offset + be->be_length) +			break; +		if (new->be_f_offset >= be->be_f_offset) { +			if (end <= be->be_f_offset + be->be_length) { +				/* new is a subset of existing be*/ +				if (extents_consistent(be, new)) { +					dprintk("%s: new is subset, ignoring\n", +						__func__); +					bl_put_extent(new); +					return 0; +				} else { +					goto out_err; +				} +			} else { +				/* |<--   be   -->| +				 *          |<--   new   -->| */ +				if (extents_consistent(be, new)) { +					/* extend new to fully replace be */ +					new->be_length += new->be_f_offset - +						be->be_f_offset; +					new->be_f_offset = be->be_f_offset; +					new->be_v_offset = be->be_v_offset; +					dprintk("%s: removing %p\n", __func__, be); +					list_del(&be->be_node); +					bl_put_extent(be); +				} else { +					goto out_err; +				} +			} +		} else if (end >= be->be_f_offset + be->be_length) { +			/* new extent overlap existing be */ +			if (extents_consistent(be, new)) { +				/* extend new to fully replace be */ +				dprintk("%s: removing %p\n", __func__, be); +				list_del(&be->be_node); +				bl_put_extent(be); +			} else { +				goto out_err; +			} +		} else if (end > be->be_f_offset) { +			/*           |<--   be   -->| +			 *|<--   new   -->| */ +			if (extents_consistent(new, be)) { +				/* extend new to fully replace be */ +				new->be_length += be->be_f_offset + be->be_length - +					new->be_f_offset - new->be_length; +				dprintk("%s: removing %p\n", __func__, be); +				list_del(&be->be_node); +				bl_put_extent(be); +			} else { +				goto out_err; +			} +		} +	} +	/* Note that if we never hit the above break, be will not point to a +	 * valid extent.  However, in that case &be->be_node==list. 
+	 */ +	list_add(&new->be_node, &be->be_node); +	dprintk("%s: inserting new\n", __func__); +	print_elist(list); +	/* FIXME - The per-list consistency checks have all been done, +	 * should now check cross-list consistency. +	 */ +	return 0; + + out_err: +	bl_put_extent(new); +	return -EIO; +} + +/* Returns extent, or NULL.  If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID.  Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extents that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, +	    struct pnfs_block_extent **cow_read) +{ +	struct pnfs_block_extent *be, *cow, *ret; +	int i; + +	dprintk("%s enter with isect %llu\n", __func__, (u64)isect); +	cow = ret = NULL; +	spin_lock(&bl->bl_ext_lock); +	for (i = 0; i < EXTENT_LISTS; i++) { +		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { +			if (isect >= be->be_f_offset + be->be_length) +				break; +			if (isect >= be->be_f_offset) { +				/* We have found an extent */ +				dprintk("%s Get %p (%i)\n", __func__, be, +					atomic_read(&be->be_refcnt.refcount)); +				kref_get(&be->be_refcnt); +				if (!ret) +					ret = be; +				else if (be->be_state != PNFS_BLOCK_READ_DATA) +					bl_put_extent(be); +				else +					cow = be; +				break; +			} +		} +		if (ret && +		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) +			break; +	} +	spin_unlock(&bl->bl_ext_lock); +	if (cow_read) +		*cow_read = cow; +	print_bl_extent(ret); +	return ret; +} + +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ +	struct pnfs_block_extent *be, *ret = NULL; +	int i; + +	dprintk("%s enter with isect %llu\n", __func__, 
(u64)isect); +	for (i = 0; i < EXTENT_LISTS; i++) { +		if (ret) +			break; +		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { +			if (isect >= be->be_f_offset + be->be_length) +				break; +			if (isect >= be->be_f_offset) { +				/* We have found an extent */ +				dprintk("%s Get %p (%i)\n", __func__, be, +					atomic_read(&be->be_refcnt.refcount)); +				kref_get(&be->be_refcnt); +				ret = be; +				break; +			} +		} +	} +	print_bl_extent(ret); +	return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +			       struct xdr_stream *xdr, +			       const struct nfs4_layoutcommit_args *arg) +{ +	struct pnfs_block_short_extent *lce, *save; +	unsigned int count = 0; +	__be32 *p, *xdr_start; + +	dprintk("%s enter\n", __func__); +	/* BUG - creation of bl_commit is buggy - need to wait for +	 * entire block to be marked WRITTEN before it can be added. +	 */ +	spin_lock(&bl->bl_ext_lock); +	/* Want to adjust for possible truncate */ +	/* We now want to adjust argument range */ + +	/* XDR encode the ranges found */ +	xdr_start = xdr_reserve_space(xdr, 8); +	if (!xdr_start) +		goto out; +	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { +		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); +		if (!p) +			break; +		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); +		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); +		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); +		p = xdr_encode_hyper(p, 0LL); +		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); +		list_del(&lce->bse_node); +		list_add_tail(&lce->bse_node, &bl->bl_committing); +		bl->bl_count--; +		count++; +	} +	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); +	xdr_start[1] = cpu_to_be32(count); +out: +	spin_unlock(&bl->bl_ext_lock); +	dprintk("%s found %i ranges\n", __func__, count); +	return 0; +} + +/* Helper function to set_to_rw that initialize a new extent */ +static void 
+_prep_new_extent(struct pnfs_block_extent *new, +		 struct pnfs_block_extent *orig, +		 sector_t offset, sector_t length, int state) +{ +	kref_init(&new->be_refcnt); +	/* don't need to INIT_LIST_HEAD(&new->be_node) */ +	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); +	new->be_mdev = orig->be_mdev; +	new->be_f_offset = offset; +	new->be_length = length; +	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; +	new->be_state = state; +	new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, +	     struct pnfs_block_extent *storage) +{ +	struct pnfs_block_extent *prev; + +	if (!storage) +		goto no_merge; +	if (&be->be_node == head || be->be_node.prev == head) +		goto no_merge; +	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); +	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || +	    !extents_consistent(prev, be)) +		goto no_merge; +	_prep_new_extent(storage, prev, prev->be_f_offset, +			 prev->be_length + be->be_length, prev->be_state); +	list_replace(&prev->be_node, &storage->be_node); +	bl_put_extent(prev); +	list_del(&be->be_node); +	bl_put_extent(be); +	return storage; + + no_merge: +	kfree(storage); +	return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ +	u64 rv = offset + length; +	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; +	struct pnfs_block_extent *children[3]; +	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; +	int i = 0, j; + +	dprintk("%s(%llu, %llu)\n", __func__, offset, length); +	/* Create storage for up to three new extents e1, e2, e3 */ +	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); +	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); +	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); +	/* BUG - we are ignoring any failure */ +	if (!e1 || !e2 || !e3) +		goto 
out_nosplit; + +	spin_lock(&bl->bl_ext_lock); +	be = bl_find_get_extent_locked(bl, offset); +	rv = be->be_f_offset + be->be_length; +	if (be->be_state != PNFS_BLOCK_INVALID_DATA) { +		spin_unlock(&bl->bl_ext_lock); +		goto out_nosplit; +	} +	/* Add e* to children, bumping e*'s krefs */ +	if (be->be_f_offset != offset) { +		_prep_new_extent(e1, be, be->be_f_offset, +				 offset - be->be_f_offset, +				 PNFS_BLOCK_INVALID_DATA); +		children[i++] = e1; +		print_bl_extent(e1); +	} else +		merge1 = e1; +	_prep_new_extent(e2, be, offset, +			 min(length, be->be_f_offset + be->be_length - offset), +			 PNFS_BLOCK_READWRITE_DATA); +	children[i++] = e2; +	print_bl_extent(e2); +	if (offset + length < be->be_f_offset + be->be_length) { +		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, +				 be->be_f_offset + be->be_length - +				 offset - length, +				 PNFS_BLOCK_INVALID_DATA); +		children[i++] = e3; +		print_bl_extent(e3); +	} else +		merge2 = e3; + +	/* Remove be from list, and insert the e* */ +	/* We don't get refs on e*, since this list is the base reference +	 * set when init'ed. +	 */ +	if (i < 3) +		children[i] = NULL; +	new = children[0]; +	list_replace(&be->be_node, &new->be_node); +	bl_put_extent(be); +	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); +	for (j = 1; j < i; j++) { +		old = new; +		new = children[j]; +		list_add(&new->be_node, &old->be_node); +	} +	if (merge2) { +		/* This is a HACK, should just create a _back_merge function */ +		new = list_entry(new->be_node.next, +				 struct pnfs_block_extent, be_node); +		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); +	} +	spin_unlock(&bl->bl_ext_lock); + +	/* Since we removed the base reference above, be is now scheduled for +	 * destruction. 
+	 */ +	bl_put_extent(be); +	dprintk("%s returns %llu after split\n", __func__, rv); +	return rv; + + out_nosplit: +	kfree(e1); +	kfree(e2); +	kfree(e3); +	dprintk("%s returns %llu without splitting\n", __func__, rv); +	return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +			      const struct nfs4_layoutcommit_args *arg, +			      int status) +{ +	struct pnfs_block_short_extent *lce, *save; + +	dprintk("%s status %d\n", __func__, status); +	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { +		if (likely(!status)) { +			u64 offset = lce->bse_f_offset; +			u64 end = offset + lce->bse_length; + +			do { +				offset = set_to_rw(bl, offset, end - offset); +			} while (offset < end); +			list_del(&lce->bse_node); + +			kfree(lce); +		} else { +			list_del(&lce->bse_node); +			spin_lock(&bl->bl_ext_lock); +			add_to_commitlist(bl, lce); +			spin_unlock(&bl->bl_ext_lock); +		} +	} +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 76f856e284e..7cf6cafcc00 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -6,7 +6,7 @@  #include <linux/completion.h>  #include <linux/sunrpc/cache.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  /*   * Deferred request handling diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b257383bb56..07df5f1d85e 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,6 +38,7 @@ enum nfs4_callback_opnum {  struct cb_process_state {  	__be32			drc_status;  	struct nfs_client	*clp; +	int			slotid;  };  struct cb_compound_hdr_arg { @@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall(  	void *dummy, struct cb_process_state *cps);  extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); -extern void nfs4_cb_take_slot(struct nfs_client *clp);  struct cb_devicenotifyitem {  	uint32_t		cbd_notify_type; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index d4d1954e9bb..43926add945 100644 --- a/fs/nfs/callback_proc.c +++ 
b/fs/nfs/callback_proc.c @@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf  static u32 initiate_file_draining(struct nfs_client *clp,  				  struct cb_layoutrecallargs *args)  { +	struct nfs_server *server;  	struct pnfs_layout_hdr *lo;  	struct inode *ino;  	bool found = false; @@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp,  	LIST_HEAD(free_me_list);  	spin_lock(&clp->cl_lock); -	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { -		if (nfs_compare_fh(&args->cbl_fh, -				   &NFS_I(lo->plh_inode)->fh)) -			continue; -		ino = igrab(lo->plh_inode); -		if (!ino) -			continue; -		found = true; -		/* Without this, layout can be freed as soon -		 * as we release cl_lock. -		 */ -		get_layout_hdr(lo); -		break; +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		list_for_each_entry(lo, &server->layouts, plh_layouts) { +			if (nfs_compare_fh(&args->cbl_fh, +					   &NFS_I(lo->plh_inode)->fh)) +				continue; +			ino = igrab(lo->plh_inode); +			if (!ino) +				continue; +			found = true; +			/* Without this, layout can be freed as soon +			 * as we release cl_lock. 
+			 */ +			get_layout_hdr(lo); +			break; +		} +		if (found) +			break;  	} +	rcu_read_unlock();  	spin_unlock(&clp->cl_lock); +  	if (!found)  		return NFS4ERR_NOMATCHING_LAYOUT; @@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,  static u32 initiate_bulk_draining(struct nfs_client *clp,  				  struct cb_layoutrecallargs *args)  { +	struct nfs_server *server;  	struct pnfs_layout_hdr *lo;  	struct inode *ino;  	u32 rv = NFS4ERR_NOMATCHING_LAYOUT; @@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,  	};  	spin_lock(&clp->cl_lock); -	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {  		if ((args->cbl_recall_type == RETURN_FSID) && -		    memcmp(&NFS_SERVER(lo->plh_inode)->fsid, -			   &args->cbl_fsid, sizeof(struct nfs_fsid))) -			continue; -		if (!igrab(lo->plh_inode)) +		    memcmp(&server->fsid, &args->cbl_fsid, +			   sizeof(struct nfs_fsid)))  			continue; -		get_layout_hdr(lo); -		BUG_ON(!list_empty(&lo->plh_bulk_recall)); -		list_add(&lo->plh_bulk_recall, &recall_list); + +		list_for_each_entry(lo, &server->layouts, plh_layouts) { +			if (!igrab(lo->plh_inode)) +				continue; +			get_layout_hdr(lo); +			BUG_ON(!list_empty(&lo->plh_bulk_recall)); +			list_add(&lo->plh_bulk_recall, &recall_list); +		}  	} +	rcu_read_unlock();  	spin_unlock(&clp->cl_lock); +  	list_for_each_entry_safe(lo, tmp,  				 &recall_list, plh_bulk_recall) {  		ino = lo->plh_inode; @@ -333,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)  	/* Normal */  	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {  		slot->seq_nr++; -		return htonl(NFS4_OK); +		goto out_ok;  	}  	/* Replay */ @@ -352,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)  	/* Wraparound */  	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {  		slot->seq_nr = 1; -		return 
htonl(NFS4_OK); +		goto out_ok;  	}  	/* Misordered request */  	return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: +	tbl->highest_used_slotid = args->csa_slotid; +	return htonl(NFS4_OK);  }  /* @@ -418,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,  			      struct cb_sequenceres *res,  			      struct cb_process_state *cps)  { +	struct nfs4_slot_table *tbl;  	struct nfs_client *clp;  	int i;  	__be32 status = htonl(NFS4ERR_BADSESSION); -	cps->clp = NULL; -  	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);  	if (clp == NULL)  		goto out; +	tbl = &clp->cl_session->bc_slot_table; + +	spin_lock(&tbl->slot_tbl_lock);  	/* state manager is resetting the session */  	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { -		status = NFS4ERR_DELAY; +		spin_unlock(&tbl->slot_tbl_lock); +		status = htonl(NFS4ERR_DELAY); +		/* Return NFS4ERR_BADSESSION if we're draining the session +		 * in order to reset it. +		 */ +		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) +			status = htonl(NFS4ERR_BADSESSION);  		goto out;  	}  	status = validate_seqid(&clp->cl_session->bc_slot_table, args); +	spin_unlock(&tbl->slot_tbl_lock);  	if (status)  		goto out; +	cps->slotid = args->csa_slotid; +  	/*  	 * Check for pending referring calls.  
If a match is found, a  	 * related callback was received before the response to the original @@ -454,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,  	res->csr_slotid = args->csa_slotid;  	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;  	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; -	nfs4_cb_take_slot(clp);  out:  	cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c6c86a77e04..918ad647afe 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)  	 * Let the state manager know callback processing done.  	 * A single slot, so highest used slotid is either 0 or -1  	 */ -	tbl->highest_used_slotid--; +	tbl->highest_used_slotid = -1;  	nfs4_check_drain_bc_complete(session);  	spin_unlock(&tbl->slot_tbl_lock);  } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps)  { -	if (clp && clp->cl_session) -		nfs4_callback_free_slot(clp->cl_session); -} - -/* A single slot, so highest used slotid is either 0 or -1 */ -void nfs4_cb_take_slot(struct nfs_client *clp) -{ -	struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; - -	spin_lock(&tbl->slot_tbl_lock); -	tbl->highest_used_slotid++; -	BUG_ON(tbl->highest_used_slotid != 0); -	spin_unlock(&tbl->slot_tbl_lock); +	if (cps->slotid != -1) +		nfs4_callback_free_slot(cps->clp->cl_session);  }  #else /* CONFIG_NFS_V4_1 */ @@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)  	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);  } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps)  {  }  #endif /* CONFIG_NFS_V4_1 */ @@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  	struct cb_process_state cps = {  		.drc_status = 
0,  		.clp = NULL, +		.slotid = -1,  	};  	unsigned int nops = 0; @@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  	*hdr_res.status = status;  	*hdr_res.nops = htonl(nops); -	nfs4_cb_free_slot(cps.clp); +	nfs4_cb_free_slot(&cps);  	nfs_put_client(cps.clp);  	dprintk("%s: done, status = %u\n", __func__, ntohl(status));  	return rpc_success; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b3dc2b88b65..5833fbbf59b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -105,7 +105,7 @@ struct rpc_program nfs_program = {  	.nrvers			= ARRAY_SIZE(nfs_version),  	.version		= nfs_version,  	.stats			= &nfs_rpcstat, -	.pipe_dir_name		= "/nfs", +	.pipe_dir_name		= NFS_PIPE_DIRNAME,  };  struct rpc_stat nfs_rpcstat = { @@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_  	cred = rpc_lookup_machine_cred();  	if (!IS_ERR(cred))  		clp->cl_machine_cred = cred; -#if defined(CONFIG_NFS_V4_1) -	INIT_LIST_HEAD(&clp->cl_layouts); -#endif  	nfs_fscache_get_client_cookie(clp);  	return clp; @@ -293,6 +290,7 @@ static void nfs_free_client(struct nfs_client *clp)  	nfs4_deviceid_purge_client(clp);  	kfree(clp->cl_hostname); +	kfree(clp->server_scope);  	kfree(clp);  	dprintk("<-- nfs_free_client()\n"); @@ -906,7 +904,9 @@ error:  /*   * Load up the server record from information gained in an fsinfo record   */ -static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +static void nfs_server_set_fsinfo(struct nfs_server *server, +				  struct nfs_fh *mntfh, +				  struct nfs_fsinfo *fsinfo)  {  	unsigned long max_rpc_payload; @@ -936,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *  	if (server->wsize > NFS_MAX_FILE_IO_SIZE)  		server->wsize = NFS_MAX_FILE_IO_SIZE;  	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -	set_pnfs_layoutdriver(server, fsinfo->layouttype); +	
server->pnfs_blksize = fsinfo->blksize; +	set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);  	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -982,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str  	if (error < 0)  		goto out_error; -	nfs_server_set_fsinfo(server, &fsinfo); +	nfs_server_set_fsinfo(server, mntfh, &fsinfo);  	/* Get some general file system info */  	if (server->namelen == 0) { @@ -1062,6 +1063,7 @@ static struct nfs_server *nfs_alloc_server(void)  	INIT_LIST_HEAD(&server->client_link);  	INIT_LIST_HEAD(&server->master_link);  	INIT_LIST_HEAD(&server->delegations); +	INIT_LIST_HEAD(&server->layouts);  	atomic_set(&server->active, 0); @@ -1464,7 +1466,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,  	dprintk("<-- %s %p\n", __func__, clp);  	return clp;  } -EXPORT_SYMBOL(nfs4_set_ds_client); +EXPORT_SYMBOL_GPL(nfs4_set_ds_client);  /*   * Session has been established, and the client marked ready. 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dd25c2aec37..321a66bc384 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode)  	return err;  } -static void nfs_mark_return_delegation(struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_server *server, +		struct nfs_delegation *delegation)  { -	struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; -  	set_bit(NFS_DELEGATION_RETURN, &delegation->flags); -	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);  }  /** @@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server,  		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))  			continue;  		if (delegation->type & flags) -			nfs_mark_return_delegation(delegation); +			nfs_mark_return_delegation(server, delegation);  	}  } @@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)  	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {  		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))  			continue; -		nfs_mark_return_delegation(delegation); +		nfs_mark_return_delegation(server, delegation);  	}  } @@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)  int nfs_async_inode_return_delegation(struct inode *inode,  				      const nfs4_stateid *stateid)  { -	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client;  	struct nfs_delegation *delegation;  	rcu_read_lock(); @@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,  		rcu_read_unlock();  		return -ENOENT;  	} -	nfs_mark_return_delegation(delegation); +	nfs_mark_return_delegation(server, delegation);  	rcu_read_unlock();  	
nfs_delegation_run_state_manager(clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 57f578e2560..b238d95ac48 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {  #endif /* CONFIG_NFS_V4 */ -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)  {  	struct nfs_open_dir_context *ctx;  	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);  	if (ctx != NULL) {  		ctx->duped = 0; +		ctx->attr_gencount = NFS_I(dir)->attr_gencount;  		ctx->dir_cookie = 0;  		ctx->dup_cookie = 0;  		ctx->cred = get_rpccred(cred); -	} else -		ctx = ERR_PTR(-ENOMEM); -	return ctx; +		return ctx; +	} +	return  ERR_PTR(-ENOMEM);  }  static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) @@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)  	cred = rpc_lookup_cred();  	if (IS_ERR(cred))  		return PTR_ERR(cred); -	ctx = alloc_nfs_open_dir_context(cred); +	ctx = alloc_nfs_open_dir_context(inode, cred);  	if (IS_ERR(ctx)) {  		res = PTR_ERR(ctx);  		goto out; @@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri  {  	loff_t diff = desc->file->f_pos - desc->current_index;  	unsigned int index; -	struct nfs_open_dir_context *ctx = desc->file->private_data;  	if (diff < 0)  		goto out_eof; @@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri  	index = (unsigned int)diff;  	*desc->dir_cookie = array->array[index].cookie;  	desc->cache_entry_index = index; -	ctx->duped = 0;  	return 0;  out_eof:  	desc->eof = 1; @@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des  	int i;  	loff_t new_pos;  	int status = -EAGAIN; -	struct nfs_open_dir_context *ctx = desc->file->private_data;  	for (i = 0; i < array->size; i++) {  		if 
(array->array[i].cookie == *desc->dir_cookie) { +			struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); +			struct nfs_open_dir_context *ctx = desc->file->private_data; +  			new_pos = desc->current_index + i; -			if (new_pos < desc->file->f_pos) { +			if (ctx->attr_gencount != nfsi->attr_gencount +			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { +				ctx->duped = 0; +				ctx->attr_gencount = nfsi->attr_gencount; +			} else if (new_pos < desc->file->f_pos) { +				if (ctx->duped > 0 +				    && ctx->dup_cookie == *desc->dir_cookie) { +					if (printk_ratelimit()) { +						pr_notice("NFS: directory %s/%s contains a readdir loop." +								"Please contact your server vendor.  " +								"The file: %s has duplicate cookie %llu\n", +								desc->file->f_dentry->d_parent->d_name.name, +								desc->file->f_dentry->d_name.name, +								array->array[i].string.name, +								*desc->dir_cookie); +					} +					status = -ELOOP; +					goto out; +				}  				ctx->dup_cookie = *desc->dir_cookie; -				ctx->duped = 1; +				ctx->duped = -1;  			}  			desc->file->f_pos = new_pos;  			desc->cache_entry_index = i; @@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des  		if (*desc->dir_cookie == array->last_cookie)  			desc->eof = 1;  	} +out:  	return status;  } @@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,  	struct nfs_cache_array *array = NULL;  	struct nfs_open_dir_context *ctx = file->private_data; -	if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { -		if (printk_ratelimit()) { -			pr_notice("NFS: directory %s/%s contains a readdir loop.  " -				"Please contact your server vendor.  
" -				"Offending cookie: %llu\n", -				file->f_dentry->d_parent->d_name.name, -				file->f_dentry->d_name.name, -				*desc->dir_cookie); -		} -		res = -ELOOP; -		goto out; -	} -  	array = nfs_readdir_get_array(desc->page);  	if (IS_ERR(array)) {  		res = PTR_ERR(array); @@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,  			*desc->dir_cookie = array->array[i+1].cookie;  		else  			*desc->dir_cookie = array->last_cookie; +		if (ctx->duped != 0) +			ctx->duped = 1;  	}  	if (array->eof_index >= 0)  		desc->eof = 1; @@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,  	struct page	*page = NULL;  	int		status;  	struct inode *inode = desc->file->f_path.dentry->d_inode; +	struct nfs_open_dir_context *ctx = desc->file->private_data;  	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",  			(unsigned long long)*desc->dir_cookie); @@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,  	desc->page_index = 0;  	desc->last_cookie = *desc->dir_cookie;  	desc->page = page; +	ctx->duped = 0;  	status = nfs_readdir_xdr_to_array(desc, page, inode);  	if (status < 0) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b35d25b98da..1940f1a56a5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -53,7 +53,7 @@  #include <asm/system.h>  #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include "internal.h"  #include "iostat.h" diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a55347a2da..ab12913dd47 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb);  extern char *nfs_path(char **p, struct dentry *dentry,  		      char *buffer, ssize_t buflen);  extern struct vfsmount *nfs_d_automount(struct path *path); +#ifdef CONFIG_NFS_V4 +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); +#endif  /* getroot.c */  extern struct dentry 
*nfs_get_root(struct super_block *, struct nfs_fh *, @@ -288,12 +291,22 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,  extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);  #endif +struct nfs_pageio_descriptor;  /* read.c */  extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,  			     const struct rpc_call_ops *call_ops);  extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, +		struct list_head *head); + +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_readdata_release(struct nfs_read_data *rdata);  /* write.c */ +extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, +		struct list_head *head); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_writedata_release(struct nfs_write_data *wdata);  extern void nfs_commit_free(struct nfs_write_data *p);  extern int nfs_initiate_write(struct nfs_write_data *data,  			      struct rpc_clnt *clnt, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 1f063bacd28..8102391bb37 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -119,7 +119,7 @@ Elong:  }  #ifdef CONFIG_NFS_V4 -static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)  {  	struct gss_api_mech *mech;  	struct xdr_netobj oid; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e49e73107e6..7ef23979896 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -415,7 +415,7 @@ fail:  }  int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, -		mode_t mode) +		umode_t mode)  {  	struct posix_acl *dfacl, *acl;  	int error = 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 38053d823eb..85f1690ca08 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -316,7 +316,7 @@ 
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		 int flags, struct nfs_open_context *ctx)  {  	struct nfs3_createdata *data; -	mode_t mode = sattr->ia_mode; +	umode_t mode = sattr->ia_mode;  	int status = -ENOMEM;  	dprintk("NFS call  create %s\n", dentry->d_name.name); @@ -562,7 +562,7 @@ static int  nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  {  	struct nfs3_createdata *data; -	int mode = sattr->ia_mode; +	umode_t mode = sattr->ia_mode;  	int status = -ENOMEM;  	dprintk("NFS call  mkdir %s\n", dentry->d_name.name); @@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		dev_t rdev)  {  	struct nfs3_createdata *data; -	mode_t mode = sattr->ia_mode; +	umode_t mode = sattr->ia_mode;  	int status = -ENOMEM;  	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b788f2eb1ba..1ec1a85fa71 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -48,6 +48,7 @@ enum nfs4_client_state {  	NFS4CLNT_SESSION_RESET,  	NFS4CLNT_RECALL_SLOT,  	NFS4CLNT_LEASE_CONFIRM, +	NFS4CLNT_SERVER_SCOPE_MISMATCH,  };  enum nfs4_session_state { @@ -66,6 +67,8 @@ struct nfs4_minor_version_ops {  			int cache_reply);  	int	(*validate_stateid)(struct nfs_delegation *,  			const nfs4_stateid *); +	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *, +			struct nfs_fsinfo *);  	const struct nfs4_state_recovery_ops *reboot_recovery_ops;  	const struct nfs4_state_recovery_ops *nograce_recovery_ops;  	const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -315,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];  extern const u32 nfs4_fattr_bitmap[2];  extern const u32 nfs4_statfs_bitmap[2];  extern const u32 nfs4_pathconf_bitmap[2]; -extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[3];  extern const u32 nfs4_fs_locations_bitmap[2];  /* nfs4renewd.c */ @@ -349,6 
+352,8 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);  extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);  extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);  extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs41_handle_server_scope(struct nfs_client *, +				      struct server_scope **);  extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);  extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);  extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index f9d03abcd04..e8915d4840a 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)  	pnfs_set_layoutcommit(wdata);  	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, -		(unsigned long) wdata->lseg->pls_end_pos); +		(unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);  }  /* @@ -334,6 +334,9 @@ filelayout_read_pagelist(struct nfs_read_data *data)  		__func__, data->inode->i_ino,  		data->args.pgbase, (size_t)data->args.count, offset); +	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) +		return PNFS_NOT_ATTEMPTED; +  	/* Retrieve the correct rpc_client for the byte range */  	j = nfs4_fl_calc_j_index(lseg, offset);  	idx = nfs4_fl_calc_ds_index(lseg, j); @@ -344,8 +347,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)  		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);  		return PNFS_NOT_ATTEMPTED;  	} -	dprintk("%s USE DS:ip %x %hu\n", __func__, -		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); +	dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);  	/* No multipath support. 
Use first DS */  	data->ds_clp = ds->ds_clp; @@ -374,6 +376,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)  	struct nfs_fh *fh;  	int status; +	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) +		return PNFS_NOT_ATTEMPTED; +  	/* Retrieve the correct rpc_client for the byte range */  	j = nfs4_fl_calc_j_index(lseg, offset);  	idx = nfs4_fl_calc_ds_index(lseg, j); @@ -384,9 +389,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)  		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);  		return PNFS_NOT_ATTEMPTED;  	} -	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, +	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,  		data->inode->i_ino, sync, (size_t) data->args.count, offset, -		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); +		ds->ds_remotestr);  	data->write_done_cb = filelayout_write_done_cb;  	data->ds_clp = ds->ds_clp; @@ -428,6 +433,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,  	dprintk("--> %s\n", __func__); +	/* FIXME: remove this check when layout segment support is added */ +	if (lgr->range.offset != 0 || +	    lgr->range.length != NFS4_MAX_UINT64) { +		dprintk("%s Only whole file layouts supported. 
Use MDS i/o\n", +			__func__); +		goto out; +	} +  	if (fl->pattern_offset > lgr->range.offset) {  		dprintk("%s pattern_offset %lld too large\n",  				__func__, fl->pattern_offset); @@ -449,6 +462,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,  			goto out;  	} else  		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); +	/* Found deviceid is being reaped */ +	if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) +			goto out_put; +  	fl->dsaddr = dsaddr;  	if (fl->first_stripe_index < 0 || @@ -659,7 +676,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,   * return true  : coalesce page   * return false : don't coalesce page   */ -bool +static bool  filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  		   struct nfs_page *req)  { @@ -670,8 +687,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  	    !nfs_generic_pg_test(pgio, prev, req))  		return false; -	if (!pgio->pg_lseg) -		return 1;  	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;  	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;  	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; @@ -682,6 +697,52 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  	return (p_stripe == r_stripe);  } +void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, +			struct nfs_page *req) +{ +	BUG_ON(pgio->pg_lseg != NULL); + +	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   0, +					   NFS4_MAX_UINT64, +					   IOMODE_READ, +					   GFP_KERNEL); +	/* If no lseg, fall back to read through mds */ +	if (pgio->pg_lseg == NULL) +		nfs_pageio_reset_read_mds(pgio); +} + +void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, +			 struct nfs_page *req) +{ +	BUG_ON(pgio->pg_lseg != NULL); + +	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   0, +					   NFS4_MAX_UINT64, +					   IOMODE_RW, +					   
GFP_NOFS); +	/* If no lseg, fall back to write through mds */ +	if (pgio->pg_lseg == NULL) +		nfs_pageio_reset_write_mds(pgio); +} + +static const struct nfs_pageio_ops filelayout_pg_read_ops = { +	.pg_init = filelayout_pg_init_read, +	.pg_test = filelayout_pg_test, +	.pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { +	.pg_init = filelayout_pg_init_write, +	.pg_test = filelayout_pg_test, +	.pg_doio = pnfs_generic_pg_writepages, +}; +  static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)  {  	return !FILELAYOUT_LSEG(lseg)->commit_through_mds; @@ -879,7 +940,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {  	.owner			= THIS_MODULE,  	.alloc_lseg		= filelayout_alloc_lseg,  	.free_lseg		= filelayout_free_lseg, -	.pg_test		= filelayout_pg_test, +	.pg_read_ops		= &filelayout_pg_read_ops, +	.pg_write_ops		= &filelayout_pg_write_ops,  	.mark_pnfs_commit	= filelayout_mark_pnfs_commit,  	.choose_commit_list	= filelayout_choose_commit_list,  	.commit_pagelist	= filelayout_commit_pagelist, @@ -902,5 +964,7 @@ static void __exit nfs4filelayout_exit(void)  	pnfs_unregister_layoutdriver(&filelayout_type);  } +MODULE_ALIAS("nfs-layouttype4-1"); +  module_init(nfs4filelayout_init);  module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index cebe01e3795..2e42284253f 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -47,10 +47,17 @@ enum stripetype4 {  };  /* Individual ip address */ +struct nfs4_pnfs_ds_addr { +	struct sockaddr_storage	da_addr; +	size_t			da_addrlen; +	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */ +	char			*da_remotestr;	/* human readable addr+port */ +}; +  struct nfs4_pnfs_ds {  	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */ -	u32			ds_ip_addr; -	u32			ds_port; +	char			*ds_remotestr;	/* comma sep list of addrs */ +	struct list_head	ds_addrs;  	struct nfs_client	*ds_clp;  	
atomic_t		ds_count;  }; @@ -89,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)  			    generic_hdr);  } +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ +	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} +  extern struct nfs_fh *  nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 3b7bf137726..ed388aae968 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -56,54 +56,139 @@ print_ds(struct nfs4_pnfs_ds *ds)  		printk("%s NULL device\n", __func__);  		return;  	} -	printk("        ip_addr %x port %hu\n" +	printk("        ds %s\n"  		"        ref count %d\n"  		"        client %p\n"  		"        cl_exchange_flags %x\n", -		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), +		ds->ds_remotestr,  		atomic_read(&ds->ds_count), ds->ds_clp,  		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);  } -/* nfs4_ds_cache_lock is held */ +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) +{ +	struct sockaddr_in *a, *b; +	struct sockaddr_in6 *a6, *b6; + +	if (addr1->sa_family != addr2->sa_family) +		return false; + +	switch (addr1->sa_family) { +	case AF_INET: +		a = (struct sockaddr_in *)addr1; +		b = (struct sockaddr_in *)addr2; + +		if (a->sin_addr.s_addr == b->sin_addr.s_addr && +		    a->sin_port == b->sin_port) +			return true; +		break; + +	case AF_INET6: +		a6 = (struct sockaddr_in6 *)addr1; +		b6 = (struct sockaddr_in6 *)addr2; + +		/* LINKLOCAL addresses must have matching scope_id */ +		if (ipv6_addr_scope(&a6->sin6_addr) == +		    IPV6_ADDR_SCOPE_LINKLOCAL && +		    a6->sin6_scope_id != b6->sin6_scope_id) +			return false; + +		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && +		    a6->sin6_port == b6->sin6_port) +			return true; +		break; + +	default: +		dprintk("%s: unhandled address family: %u\n", +			__func__, addr1->sa_family); +		return false; +	} + +	return false; 
+} + +/* + * Lookup DS by addresses.  The first matching address returns true. + * nfs4_ds_cache_lock is held + */  static struct nfs4_pnfs_ds * -_data_server_lookup_locked(u32 ip_addr, u32 port) +_data_server_lookup_locked(struct list_head *dsaddrs)  {  	struct nfs4_pnfs_ds *ds; +	struct nfs4_pnfs_ds_addr *da1, *da2; -	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", -			ntohl(ip_addr), ntohs(port)); - -	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { -		if (ds->ds_ip_addr == ip_addr && -		    ds->ds_port == port) { -			return ds; +	list_for_each_entry(da1, dsaddrs, da_node) { +		list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { +			list_for_each_entry(da2, &ds->ds_addrs, da_node) { +				if (same_sockaddr( +					(struct sockaddr *)&da1->da_addr, +					(struct sockaddr *)&da2->da_addr)) +					return ds; +			}  		}  	}  	return NULL;  }  /* + * Compare two lists of addresses. + */ +static bool +_data_server_match_all_addrs_locked(struct list_head *dsaddrs1, +				    struct list_head *dsaddrs2) +{ +	struct nfs4_pnfs_ds_addr *da1, *da2; +	size_t count1 = 0, +	       count2 = 0; + +	list_for_each_entry(da1, dsaddrs1, da_node) +		count1++; + +	list_for_each_entry(da2, dsaddrs2, da_node) { +		bool found = false; +		count2++; +		list_for_each_entry(da1, dsaddrs1, da_node) { +			if (same_sockaddr((struct sockaddr *)&da1->da_addr, +				(struct sockaddr *)&da2->da_addr)) { +				found = true; +				break; +			} +		} +		if (!found) +			return false; +	} + +	return (count1 == count2); +} + +/*   * Create an rpc connection to the nfs4_pnfs_ds data server - * Currently only support IPv4 + * Currently only supports IPv4 and IPv6 addresses   */  static int  nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)  { -	struct nfs_client *clp; -	struct sockaddr_in sin; +	struct nfs_client *clp = ERR_PTR(-EIO); +	struct nfs4_pnfs_ds_addr *da;  	int status = 0; -	dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, -		ntohl(ds->ds_ip_addr), 
ntohs(ds->ds_port), +	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,  		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); -	sin.sin_family = AF_INET; -	sin.sin_addr.s_addr = ds->ds_ip_addr; -	sin.sin_port = ds->ds_port; +	BUG_ON(list_empty(&ds->ds_addrs)); + +	list_for_each_entry(da, &ds->ds_addrs, da_node) { +		dprintk("%s: DS %s: trying address %s\n", +			__func__, ds->ds_remotestr, da->da_remotestr); + +		clp = nfs4_set_ds_client(mds_srv->nfs_client, +				 (struct sockaddr *)&da->da_addr, +				 da->da_addrlen, IPPROTO_TCP); +		if (!IS_ERR(clp)) +			break; +	} -	clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, -				 sizeof(sin), IPPROTO_TCP);  	if (IS_ERR(clp)) {  		status = PTR_ERR(clp);  		goto out; @@ -115,8 +200,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)  			goto out_put;  		}  		ds->ds_clp = clp; -		dprintk("%s [existing] ip=%x, port=%hu\n", __func__, -			ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); +		dprintk("%s [existing] server=%s\n", __func__, +			ds->ds_remotestr);  		goto out;  	} @@ -135,8 +220,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)  		goto out_put;  	ds->ds_clp = clp; -	dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), -		ntohs(ds->ds_port)); +	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);  out:  	return status;  out_put: @@ -147,12 +231,25 @@ out_put:  static void  destroy_ds(struct nfs4_pnfs_ds *ds)  { +	struct nfs4_pnfs_ds_addr *da; +  	dprintk("--> %s\n", __func__);  	ifdebug(FACILITY)  		print_ds(ds);  	if (ds->ds_clp)  		nfs_put_client(ds->ds_clp); + +	while (!list_empty(&ds->ds_addrs)) { +		da = list_first_entry(&ds->ds_addrs, +				      struct nfs4_pnfs_ds_addr, +				      da_node); +		list_del_init(&da->da_node); +		kfree(da->da_remotestr); +		kfree(da); +	} + +	kfree(ds->ds_remotestr);  	kfree(ds);  } @@ -179,31 +276,96 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)  	
kfree(dsaddr);  } +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ +	struct nfs4_pnfs_ds_addr *da; +	char *remotestr; +	size_t len; +	char *p; + +	len = 3;        /* '{', '}' and eol */ +	list_for_each_entry(da, dsaddrs, da_node) { +		len += strlen(da->da_remotestr) + 1;    /* string plus comma */ +	} + +	remotestr = kzalloc(len, gfp_flags); +	if (!remotestr) +		return NULL; + +	p = remotestr; +	*(p++) = '{'; +	len--; +	list_for_each_entry(da, dsaddrs, da_node) { +		size_t ll = strlen(da->da_remotestr); + +		if (ll > len) +			goto out_err; + +		memcpy(p, da->da_remotestr, ll); +		p += ll; +		len -= ll; + +		if (len < 1) +			goto out_err; +		(*p++) = ','; +		len--; +	} +	if (len < 2) +		goto out_err; +	*(p++) = '}'; +	*p = '\0'; +	return remotestr; +out_err: +	kfree(remotestr); +	return NULL; +} +  static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)  { -	struct nfs4_pnfs_ds *tmp_ds, *ds; +	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; +	char *remotestr; + +	if (list_empty(dsaddrs)) { +		dprintk("%s: no addresses defined\n", __func__); +		goto out; +	} -	ds = kzalloc(sizeof(*tmp_ds), gfp_flags); +	ds = kzalloc(sizeof(*ds), gfp_flags);  	if (!ds)  		goto out; +	/* this is only used for debugging, so it's ok if its NULL */ +	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); +  	spin_lock(&nfs4_ds_cache_lock); -	tmp_ds = _data_server_lookup_locked(ip_addr, port); +	tmp_ds = _data_server_lookup_locked(dsaddrs);  	if (tmp_ds == NULL) { -		ds->ds_ip_addr = ip_addr; -		ds->ds_port = port; +		INIT_LIST_HEAD(&ds->ds_addrs); +		list_splice_init(dsaddrs, &ds->ds_addrs); +		ds->ds_remotestr = remotestr;  		atomic_set(&ds->ds_count, 1);  		INIT_LIST_HEAD(&ds->ds_node);  		ds->ds_clp = NULL;  		
list_add(&ds->ds_node, &nfs4_data_server_cache); -		dprintk("%s add new data server ip 0x%x\n", __func__, -			ds->ds_ip_addr); +		dprintk("%s add new data server %s\n", __func__, +			ds->ds_remotestr);  	} else { +		if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs, +							 dsaddrs)) { +			dprintk("%s:  multipath address mismatch: %s != %s", +				__func__, tmp_ds->ds_remotestr, remotestr); +		} +		kfree(remotestr);  		kfree(ds);  		atomic_inc(&tmp_ds->ds_count); -		dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", -			__func__, tmp_ds->ds_ip_addr, +		dprintk("%s data server %s found, inc'ed ds_count to %d\n", +			__func__, tmp_ds->ds_remotestr,  			atomic_read(&tmp_ds->ds_count));  		ds = tmp_ds;  	} @@ -213,18 +375,22 @@ out:  }  /* - * Currently only support ipv4, and one multi-path address. + * Currently only supports ipv4, ipv6 and one multi-path address.   */ -static struct nfs4_pnfs_ds * -decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) +static struct nfs4_pnfs_ds_addr * +decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)  { -	struct nfs4_pnfs_ds *ds = NULL; -	char *buf; -	const char *ipend, *pstr; -	u32 ip_addr, port; -	int nlen, rlen, i; +	struct nfs4_pnfs_ds_addr *da = NULL; +	char *buf, *portstr; +	u32 port; +	int nlen, rlen;  	int tmp[2];  	__be32 *p; +	char *netid, *match_netid; +	size_t len, match_netid_len; +	char *startsep = ""; +	char *endsep = ""; +  	/* r_netid */  	p = xdr_inline_decode(streamp, 4); @@ -236,64 +402,123 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla  	if (unlikely(!p))  		goto out_err; -	/* Check that netid is "tcp" */ -	if (nlen != 3 ||  memcmp((char *)p, "tcp", 3)) { -		dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); +	netid = kmalloc(nlen+1, gfp_flags); +	if (unlikely(!netid))  		goto out_err; -	} -	/* r_addr */ +	netid[nlen] = '\0'; +	memcpy(netid, p, nlen); + +	/* r_addr: ip/ip6addr with port in dec octets - see 
RFC 5665 */  	p = xdr_inline_decode(streamp, 4);  	if (unlikely(!p)) -		goto out_err; +		goto out_free_netid;  	rlen = be32_to_cpup(p);  	p = xdr_inline_decode(streamp, rlen);  	if (unlikely(!p)) -		goto out_err; +		goto out_free_netid; -	/* ipv6 length plus port is legal */ -	if (rlen > INET6_ADDRSTRLEN + 8) { +	/* port is ".ABC.DEF", 8 chars max */ +	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {  		dprintk("%s: Invalid address, length %d\n", __func__,  			rlen); -		goto out_err; +		goto out_free_netid;  	}  	buf = kmalloc(rlen + 1, gfp_flags);  	if (!buf) {  		dprintk("%s: Not enough memory\n", __func__); -		goto out_err; +		goto out_free_netid;  	}  	buf[rlen] = '\0';  	memcpy(buf, p, rlen); -	/* replace the port dots with dashes for the in4_pton() delimiter*/ -	for (i = 0; i < 2; i++) { -		char *res = strrchr(buf, '.'); -		if (!res) { -			dprintk("%s: Failed finding expected dots in port\n", -				__func__); -			goto out_free; -		} -		*res = '-'; +	/* replace port '.' with '-' */ +	portstr = strrchr(buf, '.'); +	if (!portstr) { +		dprintk("%s: Failed finding expected dot in port\n", +			__func__); +		goto out_free_buf;  	} +	*portstr = '-'; -	/* Currently only support ipv4 address */ -	if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { -		dprintk("%s: Only ipv4 addresses supported\n", __func__); -		goto out_free; +	/* find '.' 
between address and port */ +	portstr = strrchr(buf, '.'); +	if (!portstr) { +		dprintk("%s: Failed finding expected dot between address and " +			"port\n", __func__); +		goto out_free_buf; +	} +	*portstr = '\0'; + +	da = kzalloc(sizeof(*da), gfp_flags); +	if (unlikely(!da)) +		goto out_free_buf; + +	INIT_LIST_HEAD(&da->da_node); + +	if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr, +		      sizeof(da->da_addr))) { +		dprintk("%s: error parsing address %s\n", __func__, buf); +		goto out_free_da;  	} -	/* port */ -	pstr = ipend; -	sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); +	portstr++; +	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);  	port = htons((tmp[0] << 8) | (tmp[1])); -	ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); -	dprintk("%s: Decoded address and port %s\n", __func__, buf); -out_free: +	switch (da->da_addr.ss_family) { +	case AF_INET: +		((struct sockaddr_in *)&da->da_addr)->sin_port = port; +		da->da_addrlen = sizeof(struct sockaddr_in); +		match_netid = "tcp"; +		match_netid_len = 3; +		break; + +	case AF_INET6: +		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; +		da->da_addrlen = sizeof(struct sockaddr_in6); +		match_netid = "tcp6"; +		match_netid_len = 4; +		startsep = "["; +		endsep = "]"; +		break; + +	default: +		dprintk("%s: unsupported address family: %u\n", +			__func__, da->da_addr.ss_family); +		goto out_free_da; +	} + +	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { +		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", +			__func__, netid, match_netid); +		goto out_free_da; +	} + +	/* save human readable address */ +	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; +	da->da_remotestr = kzalloc(len, gfp_flags); + +	/* NULL is ok, only used for dprintk */ +	if (da->da_remotestr) +		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, +			 buf, endsep, ntohs(port)); + +	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); +	kfree(buf); +	kfree(netid); +	return da; + 
+out_free_da: +	kfree(da); +out_free_buf: +	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);  	kfree(buf); +out_free_netid: +	kfree(netid);  out_err: -	return ds; +	return NULL;  }  /* Decode opaque device data and return the result */ @@ -310,6 +535,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  	struct xdr_stream stream;  	struct xdr_buf buf;  	struct page *scratch; +	struct list_head dsaddrs; +	struct nfs4_pnfs_ds_addr *da;  	/* set up xdr stream */  	scratch = alloc_page(gfp_flags); @@ -386,6 +613,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  				NFS_SERVER(ino)->nfs_client,  				&pdev->dev_id); +	INIT_LIST_HEAD(&dsaddrs); +  	for (i = 0; i < dsaddr->ds_num; i++) {  		int j;  		u32 mp_count; @@ -395,48 +624,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  			goto out_err_free_deviceid;  		mp_count = be32_to_cpup(p); /* multipath count */ -		if (mp_count > 1) { -			printk(KERN_WARNING -			       "%s: Multipath count %d not supported, " -			       "skipping all greater than 1\n", __func__, -				mp_count); -		}  		for (j = 0; j < mp_count; j++) { -			if (j == 0) { -				dsaddr->ds_list[i] = decode_and_add_ds(&stream, -					ino, gfp_flags); -				if (dsaddr->ds_list[i] == NULL) -					goto out_err_free_deviceid; -			} else { -				u32 len; -				/* skip extra multipath */ - -				/* read len, skip */ -				p = xdr_inline_decode(&stream, 4); -				if (unlikely(!p)) -					goto out_err_free_deviceid; -				len = be32_to_cpup(p); - -				p = xdr_inline_decode(&stream, len); -				if (unlikely(!p)) -					goto out_err_free_deviceid; +			da = decode_ds_addr(&stream, gfp_flags); +			if (da) +				list_add_tail(&da->da_node, &dsaddrs); +		} +		if (list_empty(&dsaddrs)) { +			dprintk("%s: no suitable DS addresses found\n", +				__func__); +			goto out_err_free_deviceid; +		} -				/* read len, skip */ -				p = xdr_inline_decode(&stream, 4); -				if (unlikely(!p)) -					goto 
out_err_free_deviceid; -				len = be32_to_cpup(p); +		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); +		if (!dsaddr->ds_list[i]) +			goto out_err_drain_dsaddrs; -				p = xdr_inline_decode(&stream, len); -				if (unlikely(!p)) -					goto out_err_free_deviceid; -			} +		/* If DS was already in cache, free ds addrs */ +		while (!list_empty(&dsaddrs)) { +			da = list_first_entry(&dsaddrs, +					      struct nfs4_pnfs_ds_addr, +					      da_node); +			list_del_init(&da->da_node); +			kfree(da->da_remotestr); +			kfree(da);  		}  	}  	__free_page(scratch);  	return dsaddr; +out_err_drain_dsaddrs: +	while (!list_empty(&dsaddrs)) { +		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, +				      da_node); +		list_del_init(&da->da_node); +		kfree(da->da_remotestr); +		kfree(da); +	}  out_err_free_deviceid:  	nfs4_fl_free_deviceid(dsaddr);  	/* stripe_indicies was part of dsaddr */ @@ -591,13 +815,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)  static void  filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, -			       int err, u32 ds_addr) +			       int err, const char *ds_remotestr)  {  	u32 *p = (u32 *)&dsaddr->id_node.deviceid; -	printk(KERN_ERR "NFS: data server %x connection error %d." +	printk(KERN_ERR "NFS: data server %s connection error %d."  		
" Deviceid [%x%x%x%x] marked out of use.\n", -		ds_addr, err, p[0], p[1], p[2], p[3]); +		ds_remotestr, err, p[0], p[1], p[2], p[3]);  	spin_lock(&nfs4_ds_cache_lock);  	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; @@ -628,7 +852,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)  		err = nfs4_ds_connect(s, ds);  		if (err) {  			filelayout_mark_devid_negative(dsaddr, err, -						       ntohl(ds->ds_ip_addr)); +						       ds->ds_remotestr);  			return NULL;  		}  	} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26bece8f308..8c77039e7a8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -80,7 +80,10 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,  static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  			    struct nfs_fattr *fattr, struct iattr *sattr,  			    struct nfs4_state *state); - +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *); +static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *); +#endif  /* Prevent leaks of NFSv4 errors into userland */  static int nfs4_map_errors(int err)  { @@ -137,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {  	0  }; -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE  			| FATTR4_WORD0_MAXREAD  			| FATTR4_WORD0_MAXWRITE  			| FATTR4_WORD0_LEASE_TIME,  			FATTR4_WORD1_TIME_DELTA -			| FATTR4_WORD1_FS_LAYOUT_TYPES +			| FATTR4_WORD1_FS_LAYOUT_TYPES, +			FATTR4_WORD2_LAYOUT_BLKSIZE  };  const u32 nfs4_fs_locations_bitmap[2] = { @@ -1689,6 +1693,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta  	return ret;  } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ +	int status; +	struct nfs_server *server = NFS_SERVER(state->inode); + +	status = nfs41_test_stateid(server, state); +	if (status == NFS_OK) 
+		return 0; +	nfs41_free_stateid(server, state); +	return nfs4_open_expired(sp, state); +} +#endif +  /*   * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*   * fields corresponding to attributes that were used to store the verifier. @@ -2252,13 +2270,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,  static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,  			      struct nfs_fsinfo *info)  { +	int minor_version = server->nfs_client->cl_minorversion;  	int status = nfs4_lookup_root(server, fhandle, info);  	if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))  		/*  		 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM  		 * by nfs4_map_errors() as this function exits.  		 */ -		status = nfs4_find_root_sec(server, fhandle, info); +		status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info);  	if (status == 0)  		status = nfs4_server_capabilities(server, fhandle);  	if (status == 0) @@ -4441,6 +4460,20 @@ out:  	return err;  } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ +	int status; +	struct nfs_server *server = NFS_SERVER(state->inode); + +	status = nfs41_test_stateid(server, state); +	if (status == NFS_OK) +		return 0; +	nfs41_free_stateid(server, state); +	return nfs4_lock_expired(state, request); +} +#endif +  static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)  {  	struct nfs_inode *nfsi = NFS_I(state->inode); @@ -4779,6 +4812,16 @@ out_inval:  	return -NFS4ERR_INVAL;  } +static bool +nfs41_same_server_scope(struct server_scope *a, struct server_scope *b) +{ +	if (a->server_scope_sz == b->server_scope_sz && +	    memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) +		return true; + +	return false; +} +  /*   * nfs4_proc_exchange_id()   * @@ -4821,9 +4864,31 @@ int nfs4_proc_exchange_id(struct 
nfs_client *clp, struct rpc_cred *cred)  				init_utsname()->domainname,  				clp->cl_rpcclient->cl_auth->au_flavor); +	res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); +	if (unlikely(!res.server_scope)) +		return -ENOMEM; +  	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);  	if (!status)  		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + +	if (!status) { +		if (clp->server_scope && +		    !nfs41_same_server_scope(clp->server_scope, +					     res.server_scope)) { +			dprintk("%s: server_scope mismatch detected\n", +				__func__); +			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); +			kfree(clp->server_scope); +			clp->server_scope = NULL; +		} + +		if (!clp->server_scope) +			clp->server_scope = res.server_scope; +		else +			kfree(res.server_scope); +	} +  	dprintk("<-- %s status= %d\n", __func__, status);  	return status;  } @@ -5704,7 +5769,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)  {  	struct nfs4_layoutreturn *lrp = calldata;  	struct nfs_server *server; -	struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; +	struct pnfs_layout_hdr *lo = lrp->args.layout;  	dprintk("--> %s\n", __func__); @@ -5733,7 +5798,7 @@ static void nfs4_layoutreturn_release(void *calldata)  	struct nfs4_layoutreturn *lrp = calldata;  	dprintk("--> %s\n", __func__); -	put_layout_hdr(NFS_I(lrp->args.inode)->layout); +	put_layout_hdr(lrp->args.layout);  	kfree(calldata);  	dprintk("<-- %s\n", __func__);  } @@ -5770,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)  	return status;  } +/* + * Retrieve the list of Data Server devices from the MDS. 
+ */ +static int _nfs4_getdevicelist(struct nfs_server *server, +				    const struct nfs_fh *fh, +				    struct pnfs_devicelist *devlist) +{ +	struct nfs4_getdevicelist_args args = { +		.fh = fh, +		.layoutclass = server->pnfs_curr_ld->id, +	}; +	struct nfs4_getdevicelist_res res = { +		.devlist = devlist, +	}; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; +	int status; + +	dprintk("--> %s\n", __func__); +	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, +				&res.seq_res, 0); +	dprintk("<-- %s status=%d\n", __func__, status); +	return status; +} + +int nfs4_proc_getdevicelist(struct nfs_server *server, +			    const struct nfs_fh *fh, +			    struct pnfs_devicelist *devlist) +{ +	struct nfs4_exception exception = { }; +	int err; + +	do { +		err = nfs4_handle_exception(server, +				_nfs4_getdevicelist(server, fh, devlist), +				&exception); +	} while (exception.retry); + +	dprintk("%s: err=%d, num_devs=%u\n", __func__, +		err, devlist->num_devs); + +	return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); +  static int  _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)  { @@ -5848,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)  static void nfs4_layoutcommit_release(void *calldata)  {  	struct nfs4_layoutcommit_data *data = calldata; +	struct pnfs_layout_segment *lseg, *tmp; +	pnfs_cleanup_layoutcommit(data);  	/* Matched by references in pnfs_set_layoutcommit */ -	put_lseg(data->lseg); +	list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { +		list_del_init(&lseg->pls_lc_list); +		if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, +				       &lseg->pls_flags)) +			put_lseg(lseg); +	}  	put_rpccred(data->cred);  	kfree(data);  } @@ -5901,6 +6021,143 @@ out:  	rpc_put_task(task);  	return status;  } + +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh 
*fhandle, +		    struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ +	struct nfs41_secinfo_no_name_args args = { +		.style = SECINFO_STYLE_CURRENT_FH, +	}; +	struct nfs4_secinfo_res res = { +		.flavors = flavors, +	}; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; +	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, +			   struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); +		switch (err) { +		case 0: +		case -NFS4ERR_WRONGSEC: +		case -NFS4ERR_NOTSUPP: +			break; +		default: +			err = nfs4_handle_exception(server, err, &exception); +		} +	} while (exception.retry); +	return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, +		    struct nfs_fsinfo *info) +{ +	int err; +	struct page *page; +	rpc_authflavor_t flavor; +	struct nfs4_secinfo_flavors *flavors; + +	page = alloc_page(GFP_KERNEL); +	if (!page) { +		err = -ENOMEM; +		goto out; +	} + +	flavors = page_address(page); +	err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + +	/* +	 * Fall back on "guess and check" method if +	 * the server doesn't support SECINFO_NO_NAME +	 */ +	if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { +		err = nfs4_find_root_sec(server, fhandle, info); +		goto out_freepage; +	} +	if (err) +		goto out_freepage; + +	flavor = nfs_find_best_sec(flavors); +	if (err == 0) +		err = nfs4_lookup_root_sec(server, fhandle, info, flavor); + +out_freepage: +	put_page(page); +	if (err == -EACCES) +		return -EPERM; +out: +	return err; +} +static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ +	int status; +	struct 
nfs41_test_stateid_args args = { +		.stateid = &state->stateid, +	}; +	struct nfs41_test_stateid_res res; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; +	args.seq_args.sa_session = res.seq_res.sr_session = NULL; +	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); +	return status; +} + +static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		err = nfs4_handle_exception(server, +				_nfs41_test_stateid(server, state), +				&exception); +	} while (exception.retry); +	return err; +} + +static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ +	int status; +	struct nfs41_free_stateid_args args = { +		.stateid = &state->stateid, +	}; +	struct nfs41_free_stateid_res res; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; + +	args.seq_args.sa_session = res.seq_res.sr_session = NULL; +	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); +	return status; +} + +static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		err = nfs4_handle_exception(server, +				_nfs4_free_stateid(server, state), +				&exception); +	} while (exception.retry); +	return err; +}  #endif /* CONFIG_NFS_V4_1 */  struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5937,8 +6194,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {  struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,  	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE, -	.recover_open	= nfs4_open_expired, -	.recover_lock	= nfs4_lock_expired, +	.recover_open	= nfs41_open_expired, +	
.recover_lock	= nfs41_lock_expired,  	.establish_clid = nfs41_init_clientid,  	.get_clid_cred	= nfs4_get_exchange_id_cred,  }; @@ -5962,6 +6219,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {  	.minor_version = 0,  	.call_sync = _nfs4_call_sync,  	.validate_stateid = nfs4_validate_delegation_stateid, +	.find_root_sec = nfs4_find_root_sec,  	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,  	.state_renewal_ops = &nfs40_state_renewal_ops, @@ -5972,6 +6230,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {  	.minor_version = 1,  	.call_sync = _nfs4_call_sync_session,  	.validate_stateid = nfs41_validate_delegation_stateid, +	.find_root_sec = nfs41_find_root_sec,  	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,  	.state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7acfe884362..72ab97ef3d6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1643,7 +1643,14 @@ static void nfs4_state_manager(struct nfs_client *clp)  				goto out_error;  			}  			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); -			set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + +			if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, +					       &clp->cl_state)) +				nfs4_state_start_reclaim_nograce(clp); +			else +				set_bit(NFS4CLNT_RECLAIM_REBOOT, +					&clp->cl_state); +  			pnfs_destroy_all_layouts(clp);  		} diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6e8f3b9a1d..1dce12f41a4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);  #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)  #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)  #define encode_fsinfo_maxsz	(encode_getattr_maxsz) -#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + 15) +/* The 5 accounts for the PNFS attributes, and assumes that at 
most three + * layout types will be returned. + */ +#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + \ +				 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)  #define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)  #define decode_renew_maxsz	(op_decode_hdr_maxsz)  #define encode_setclientid_maxsz \ @@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);  				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)  #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)  #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4) +#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ +				encode_verifier_maxsz) +#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ +				2 /* nfs_cookie4 gdlr_cookie */ + \ +				decode_verifier_maxsz \ +				  /* verifier4 gdlr_verifier */ + \ +				1 /* gdlr_deviceid_list count */ + \ +				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ +					    NFS4_DEVICEID4_SIZE) \ +				  /* gdlr_deviceid_list */ + \ +				1 /* bool gdlr_eof */)  #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \  				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))  #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ @@ -343,6 +358,14 @@ static int nfs4_stat_to_errno(int);  				1 /* FIXME: opaque lrf_body always empty at the moment */)  #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \  				1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz	(op_encode_hdr_maxsz + 2 + \ +					 XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz	(op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz	(op_encode_hdr_maxsz + 1 + \ +					 XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz	(op_decode_hdr_maxsz + 1)  #else /* CONFIG_NFS_V4_1 */  #define encode_sequence_maxsz	0  #define decode_sequence_maxsz	0 @@ -740,6 +763,14 @@ static int nfs4_stat_to_errno(int);  #define NFS4_dec_reclaim_complete_sz	
(compound_decode_hdr_maxsz + \  					 decode_sequence_maxsz + \  					 decode_reclaim_complete_maxsz) +#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ +				encode_sequence_maxsz + \ +				encode_putfh_maxsz + \ +				encode_getdevicelist_maxsz) +#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ +				decode_sequence_maxsz + \ +				decode_putfh_maxsz + \ +				decode_getdevicelist_maxsz)  #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \  				encode_sequence_maxsz +\  				encode_getdeviceinfo_maxsz) @@ -772,6 +803,26 @@ static int nfs4_stat_to_errno(int);  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \  				decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz	(compound_encode_hdr_maxsz + \ +					encode_sequence_maxsz + \ +					encode_putrootfh_maxsz +\ +					encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz	(compound_decode_hdr_maxsz + \ +					decode_sequence_maxsz + \ +					decode_putrootfh_maxsz + \ +					decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz	(compound_encode_hdr_maxsz + \ +					 encode_sequence_maxsz + \ +					 encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz	(compound_decode_hdr_maxsz + \ +					 decode_sequence_maxsz + \ +					 decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz	(compound_encode_hdr_maxsz + \ +					 encode_sequence_maxsz + \ +					 encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz	(compound_decode_hdr_maxsz + \ +					 decode_sequence_maxsz + \ +					 decode_free_stateid_maxsz)  const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +  				      compound_encode_hdr_maxsz + @@ -1076,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm  	hdr->replen += decode_getattr_maxsz;  } +static void +encode_getattr_three(struct xdr_stream *xdr, +		     uint32_t bm0, uint32_t bm1, uint32_t bm2, +		     struct compound_hdr *hdr) +{ +	__be32 *p; + +	p = reserve_space(xdr, 
4); +	*p = cpu_to_be32(OP_GETATTR); +	if (bm2) { +		p = reserve_space(xdr, 16); +		*p++ = cpu_to_be32(3); +		*p++ = cpu_to_be32(bm0); +		*p++ = cpu_to_be32(bm1); +		*p = cpu_to_be32(bm2); +	} else if (bm1) { +		p = reserve_space(xdr, 12); +		*p++ = cpu_to_be32(2); +		*p++ = cpu_to_be32(bm0); +		*p = cpu_to_be32(bm1); +	} else { +		p = reserve_space(xdr, 8); +		*p++ = cpu_to_be32(1); +		*p = cpu_to_be32(bm0); +	} +	hdr->nops++; +	hdr->replen += decode_getattr_maxsz; +} +  static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)  {  	encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], @@ -1084,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c  static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)  { -	encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], -			   bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); +	encode_getattr_three(xdr, +			     bitmask[0] & nfs4_fsinfo_bitmap[0], +			     bitmask[1] & nfs4_fsinfo_bitmap[1], +			     bitmask[2] & nfs4_fsinfo_bitmap[2], +			     hdr);  }  static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1827,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr,  #ifdef CONFIG_NFS_V4_1  static void +encode_getdevicelist(struct xdr_stream *xdr, +		     const struct nfs4_getdevicelist_args *args, +		     struct compound_hdr *hdr) +{ +	__be32 *p; +	nfs4_verifier dummy = { +		.data = "dummmmmy", +	}; + +	p = reserve_space(xdr, 20); +	*p++ = cpu_to_be32(OP_GETDEVICELIST); +	*p++ = cpu_to_be32(args->layoutclass); +	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); +	xdr_encode_hyper(p, 0ULL);                          /* cookie */ +	encode_nfs4_verifier(xdr, &dummy); +	hdr->nops++; +	hdr->replen += decode_getdevicelist_maxsz; +} + +static void  encode_getdeviceinfo(struct xdr_stream *xdr,  		     const struct nfs4_getdeviceinfo_args *args,  		  
   struct compound_hdr *hdr) @@ -1888,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,  	*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);  	/* Only whole file layouts */  	p = xdr_encode_hyper(p, 0); /* offset */ -	p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ +	p = xdr_encode_hyper(p, args->lastbytewritten + 1);	/* length */  	*p++ = cpu_to_be32(0); /* reclaim */  	p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);  	*p++ = cpu_to_be32(1); /* newoffset = TRUE */ @@ -1938,6 +2041,46 @@ encode_layoutreturn(struct xdr_stream *xdr,  	hdr->nops++;  	hdr->replen += decode_layoutreturn_maxsz;  } + +static int +encode_secinfo_no_name(struct xdr_stream *xdr, +		       const struct nfs41_secinfo_no_name_args *args, +		       struct compound_hdr *hdr) +{ +	__be32 *p; +	p = reserve_space(xdr, 8); +	*p++ = cpu_to_be32(OP_SECINFO_NO_NAME); +	*p++ = cpu_to_be32(args->style); +	hdr->nops++; +	hdr->replen += decode_secinfo_no_name_maxsz; +	return 0; +} + +static void encode_test_stateid(struct xdr_stream *xdr, +				struct nfs41_test_stateid_args *args, +				struct compound_hdr *hdr) +{ +	__be32 *p; + +	p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); +	*p++ = cpu_to_be32(OP_TEST_STATEID); +	*p++ = cpu_to_be32(1); +	xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); +	hdr->nops++; +	hdr->replen += decode_test_stateid_maxsz; +} + +static void encode_free_stateid(struct xdr_stream *xdr, +				struct nfs41_free_stateid_args *args, +				struct compound_hdr *hdr) +{ +	__be32 *p; +	p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); +	*p++ = cpu_to_be32(OP_FREE_STATEID); +	xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); +	hdr->nops++; +	hdr->replen += decode_free_stateid_maxsz; +}  #endif /* CONFIG_NFS_V4_1 */  /* @@ -2536,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,  	struct compound_hdr hdr = {  		.nops	= 0,  	}; -	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; +	const u32 
lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };  	encode_compound_hdr(xdr, req, &hdr);  	encode_setclientid_confirm(xdr, arg, &hdr); @@ -2680,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),  	}; -	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; +	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };  	encode_compound_hdr(xdr, req, &hdr);  	encode_sequence(xdr, &args->la_seq_args, &hdr); @@ -2707,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,  }  /* + * Encode GETDEVICELIST request + */ +static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, +				       struct xdr_stream *xdr, +				       struct nfs4_getdevicelist_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_getdevicelist(xdr, args, &hdr); +	encode_nops(&hdr); +} + +/*   * Encode GETDEVICEINFO request   */  static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, @@ -2790,6 +2951,59 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,  	encode_layoutreturn(xdr, args, &hdr);  	encode_nops(&hdr);  } + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, +					struct xdr_stream *xdr, +					struct nfs41_secinfo_no_name_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putrootfh(xdr, &hdr); +	encode_secinfo_no_name(xdr, args, &hdr); +	encode_nops(&hdr); +	return 0; +} + +/* + *  Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct 
nfs41_test_stateid_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_test_stateid(xdr, args, &hdr); +	encode_nops(&hdr); +} + +/* + *  Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs41_free_stateid_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_free_stateid(xdr, args, &hdr); +	encode_nops(&hdr); +}  #endif /* CONFIG_NFS_V4_1 */  static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2890,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)  		goto out_overflow;  	bmlen = be32_to_cpup(p); -	bitmap[0] = bitmap[1] = 0; +	bitmap[0] = bitmap[1] = bitmap[2] = 0;  	p = xdr_inline_decode(xdr, (bmlen << 2));  	if (unlikely(!p))  		goto out_overflow;  	if (bmlen > 0) {  		bitmap[0] = be32_to_cpup(p++); -		if (bmlen > 1) -			bitmap[1] = be32_to_cpup(p); +		if (bmlen > 1) { +			bitmap[1] = be32_to_cpup(p++); +			if (bmlen > 2) +				bitmap[2] = be32_to_cpup(p); +		}  	}  	return 0;  out_overflow: @@ -2929,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3  			return ret;  		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;  	} else -		bitmask[0] = bitmask[1] = 0; -	dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); +		bitmask[0] = bitmask[1] = bitmask[2] = 0; +	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, +		bitmask[0], bitmask[1], bitmask[2]);  	return 0;  } @@ -3984,7 +4202,7 @@ out_overflow:  static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)  {  	__be32 *savep; -	uint32_t attrlen, bitmap[2] = {0}; +	uint32_t 
attrlen, bitmap[3] = {0};  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4010,7 +4228,7 @@ xdr_error:  static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)  {  	__be32 *savep; -	uint32_t attrlen, bitmap[2] = {0}; +	uint32_t attrlen, bitmap[3] = {0};  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4042,7 +4260,7 @@ xdr_error:  static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)  {  	__be32 *savep; -	uint32_t attrlen, bitmap[2] = {0}; +	uint32_t attrlen, bitmap[3] = {0};  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4182,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat  {  	__be32 *savep;  	uint32_t attrlen, -		 bitmap[2] = {0}; +		 bitmap[3] = {0};  	int status;  	status = decode_op_hdr(xdr, OP_GETATTR); @@ -4268,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,  	return status;  } +/* + * The prefered block size for layout directed io + */ +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, +				      uint32_t *res) +{ +	__be32 *p; + +	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); +	*res = 0; +	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) { +			print_overflow_msg(__func__, xdr); +			return -EIO; +		} +		*res = be32_to_cpup(p); +		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; +	} +	return 0; +} +  static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)  {  	__be32 *savep; -	uint32_t attrlen, bitmap[2]; +	uint32_t attrlen, bitmap[3];  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4299,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)  	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);  	if (status != 0)  		goto xdr_error; +	status = decode_attr_layout_blksize(xdr, bitmap, 
&fsinfo->blksize); +	if (status) +		goto xdr_error;  	status = verify_attr_len(xdr, savep, attrlen);  xdr_error: @@ -4718,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,  {  	__be32 *savep;  	uint32_t attrlen, -		 bitmap[2] = {0}; +		 bitmap[3] = {0};  	struct kvec *iov = req->rq_rcv_buf.head;  	int status; @@ -4977,11 +5220,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,  	if (unlikely(status))  		return status; -	/* Throw away server_scope */ +	/* Save server_scope */  	status = decode_opaque_inline(xdr, &dummy, &dummy_str);  	if (unlikely(status))  		return status; +	if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +		return -EIO; + +	memcpy(res->server_scope->server_scope, dummy_str, dummy); +	res->server_scope->server_scope_sz = dummy; +  	/* Throw away Implementation id array */  	status = decode_opaque_inline(xdr, &dummy, &dummy_str);  	if (unlikely(status)) @@ -5141,6 +5390,53 @@ out_overflow:  }  #if defined(CONFIG_NFS_V4_1) +/* + * TODO: Need to handle case when EOF != true; + */ +static int decode_getdevicelist(struct xdr_stream *xdr, +				struct pnfs_devicelist *res) +{ +	__be32 *p; +	int status, i; +	struct nfs_writeverf verftemp; + +	status = decode_op_hdr(xdr, OP_GETDEVICELIST); +	if (status) +		return status; + +	p = xdr_inline_decode(xdr, 8 + 8 + 4); +	if (unlikely(!p)) +		goto out_overflow; + +	/* TODO: Skip cookie for now */ +	p += 2; + +	/* Read verifier */ +	p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); + +	res->num_devs = be32_to_cpup(p); + +	dprintk("%s: num_dev %d\n", __func__, res->num_devs); + +	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { +		printk(KERN_ERR "%s too many result dev_num %u\n", +				__func__, res->num_devs); +		return -EIO; +	} + +	p = xdr_inline_decode(xdr, +			      res->num_devs * NFS4_DEVICEID4_SIZE + 4); +	if (unlikely(!p)) +		goto out_overflow; +	for (i = 0; i < res->num_devs; i++) +		p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, +					    
NFS4_DEVICEID4_SIZE); +	res->eof = be32_to_cpup(p); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +}  static int decode_getdeviceinfo(struct xdr_stream *xdr,  				struct pnfs_device *pdev) @@ -5303,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,  	int status;  	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); +	res->status = status;  	if (status)  		return status; @@ -5322,6 +5619,55 @@ out_overflow:  	print_overflow_msg(__func__, xdr);  	return -EIO;  } + +static int decode_test_stateid(struct xdr_stream *xdr, +			       struct nfs41_test_stateid_res *res) +{ +	__be32 *p; +	int status; +	int num_res; + +	status = decode_op_hdr(xdr, OP_TEST_STATEID); +	if (status) +		return status; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	num_res = be32_to_cpup(p++); +	if (num_res != 1) +		goto out; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	res->status = be32_to_cpup(p++); +	return res->status; +out_overflow: +	print_overflow_msg(__func__, xdr); +out: +	return -EIO; +} + +static int decode_free_stateid(struct xdr_stream *xdr, +			       struct nfs41_free_stateid_res *res) +{ +	__be32 *p; +	int status; + +	status = decode_op_hdr(xdr, OP_FREE_STATEID); +	if (status) +		return status; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	res->status = be32_to_cpup(p++); +	return res->status; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +}  #endif /* CONFIG_NFS_V4_1 */  /* @@ -6366,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,  }  /* + * Decode GETDEVICELIST response + */ +static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, +				      struct xdr_stream *xdr, +				      struct nfs4_getdevicelist_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	dprintk("encoding getdevicelist!\n"); + +	status = decode_compound_hdr(xdr, &hdr); +	if (status != 0) +		goto out; +	status = 
decode_sequence(xdr, &res->seq_res, rqstp); +	if (status != 0) +		goto out; +	status = decode_putfh(xdr); +	if (status != 0) +		goto out; +	status = decode_getdevicelist(xdr, res->devlist); +out: +	return status; +} + +/*   * Decode GETDEVINFO response   */  static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, @@ -6461,6 +6833,72 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,  out:  	return status;  } + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, +					struct xdr_stream *xdr, +					struct nfs4_secinfo_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_putrootfh(xdr); +	if (status) +		goto out; +	status = decode_secinfo(xdr, res); +out: +	return status; +} + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs41_test_stateid_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_test_stateid(xdr, res); +out: +	return status; +} + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs41_free_stateid_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_free_stateid(xdr, res); +out: +	return status; +}  #endif /* CONFIG_NFS_V4_1 */  /** @@ -6480,7 +6918,7 @@ out:  int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,  		       int plus)  { -	
uint32_t bitmap[2] = {0}; +	uint32_t bitmap[3] = {0};  	uint32_t len;  	__be32 *p = xdr_inline_decode(xdr, 4);  	if (unlikely(!p)) @@ -6663,6 +7101,10 @@ struct rpc_procinfo	nfs4_procedures[] = {  	PROC(LAYOUTGET,		enc_layoutget,		dec_layoutget),  	PROC(LAYOUTCOMMIT,	enc_layoutcommit,	dec_layoutcommit),  	PROC(LAYOUTRETURN,	enc_layoutreturn,	dec_layoutreturn), +	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name), +	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid), +	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid), +	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),  #endif /* CONFIG_NFS_V4_1 */  }; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8ff2ea3f10e..d0cda12fddc 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write)  	for (i = 0; i <  ios->numdevs; i++) {  		struct osd_sense_info osi;  		struct osd_request *or = ios->per_dev[i].or; -		unsigned dev;  		int ret;  		if (!or) @@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write)  			continue; /* we recovered */  		} -		dev = ios->per_dev[i].dev; -		objlayout_io_set_result(&ios->ol_state, dev, -					&ios->layout->comps[dev].oc_object_id, +		objlayout_io_set_result(&ios->ol_state, i, +					&ios->layout->comps[i].oc_object_id,  					osd_pri_2_pnfs_err(osi.osd_err_pri),  					ios->per_dev[i].offset,  					ios->per_dev[i].length, @@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,  }  static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg, -		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, +		unsigned pgbase, struct _objio_per_comp *per_dev, int len,  		gfp_t gfp_flags)  {  	unsigned pg = *cur_pg; +	int cur_len = len;  	struct request_queue *q =  			osd_request_queue(_io_od(ios, per_dev->dev)); -	per_dev->length += cur_len; -  	if (per_dev->bio == 
NULL) { -		unsigned stripes = ios->layout->num_comps / -						     ios->layout->mirrors_p1; -		unsigned pages_in_stripe = stripes * +		unsigned pages_in_stripe = ios->layout->group_width *  				      (ios->layout->stripe_unit / PAGE_SIZE);  		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / -				    stripes; +				    ios->layout->group_width;  		if (BIO_MAX_PAGES_KMALLOC < bio_size)  			bio_size = BIO_MAX_PAGES_KMALLOC; @@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,  	}  	BUG_ON(cur_len); +	per_dev->length += len;  	*cur_pg = pg;  	return 0;  } @@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,  	int ret = 0;  	while (length) { -		struct _objio_per_comp *per_dev = &ios->per_dev[dev]; +		struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];  		unsigned cur_len, page_off = 0;  		if (!per_dev->length) { @@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,  				cur_len = stripe_unit;  			} -			if (max_comp < dev) -				max_comp = dev; +			if (max_comp < dev - first_dev) +				max_comp = dev - first_dev;  		} else {  			cur_len = stripe_unit;  		} @@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)  	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];  	unsigned dev = per_dev->dev;  	struct pnfs_osd_object_cred *cred = -			&ios->layout->comps[dev]; +			&ios->layout->comps[cur_comp];  	struct osd_obj_id obj = {  		.partition = cred->oc_object_id.oid_partition_id,  		.id = cred->oc_object_id.oid_object_id, @@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)  	for (; cur_comp < last_comp; ++cur_comp, ++dev) {  		struct osd_request *or = NULL;  		struct pnfs_osd_object_cred *cred = -					&ios->layout->comps[dev]; +					&ios->layout->comps[cur_comp];  		struct osd_obj_id obj = {  			.partition = cred->oc_object_id.oid_partition_id,  			.id = 
cred->oc_object_id.oid_object_id, @@ -1000,13 +996,22 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,  	if (!pnfs_generic_pg_test(pgio, prev, req))  		return false; -	if (pgio->pg_lseg == NULL) -		return true; -  	return pgio->pg_count + req->wb_bytes <=  			OBJIO_LSEG(pgio->pg_lseg)->max_io_size;  } +static const struct nfs_pageio_ops objio_pg_read_ops = { +	.pg_init = pnfs_generic_pg_init_read, +	.pg_test = objio_pg_test, +	.pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops objio_pg_write_ops = { +	.pg_init = pnfs_generic_pg_init_write, +	.pg_test = objio_pg_test, +	.pg_doio = pnfs_generic_pg_writepages, +}; +  static struct pnfs_layoutdriver_type objlayout_type = {  	.id = LAYOUT_OSD2_OBJECTS,  	.name = "LAYOUT_OSD2_OBJECTS", @@ -1020,7 +1025,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {  	.read_pagelist           = objlayout_read_pagelist,  	.write_pagelist          = objlayout_write_pagelist, -	.pg_test                 = objio_pg_test, +	.pg_read_ops             = &objio_pg_read_ops, +	.pg_write_ops            = &objio_pg_write_ops,  	.free_deviceid_node	 = objio_free_deviceid_node, @@ -1055,5 +1061,7 @@ objlayout_exit(void)  	       __func__);  } +MODULE_ALIAS("nfs-layouttype4-2"); +  module_init(objlayout_init);  module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index 16fc758e912..b3918f7ac34 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,  	p = _osd_xdr_decode_data_map(p, &layout->olo_map);  	layout->olo_comps_index = be32_to_cpup(p++);  	layout->olo_num_comps = be32_to_cpup(p++); +	dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, +		layout->olo_comps_index, layout->olo_num_comps); +  	iter->total_comps = layout->olo_num_comps;  	return 0;  } diff --git a/fs/nfs/pagelist.c 
b/fs/nfs/pagelist.c index 18449f43c56..b60970cc7f1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test);   */  void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  		     struct inode *inode, -		     int (*doio)(struct nfs_pageio_descriptor *), +		     const struct nfs_pageio_ops *pg_ops,  		     size_t bsize,  		     int io_flags)  { @@ -240,13 +240,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  	desc->pg_bsize = bsize;  	desc->pg_base = 0;  	desc->pg_moreio = 0; +	desc->pg_recoalesce = 0;  	desc->pg_inode = inode; -	desc->pg_doio = doio; +	desc->pg_ops = pg_ops;  	desc->pg_ioflags = io_flags;  	desc->pg_error = 0;  	desc->pg_lseg = NULL; -	desc->pg_test = nfs_generic_pg_test; -	pnfs_pageio_init(desc, inode);  }  /** @@ -276,7 +275,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,  		return false;  	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)  		return false; -	return pgio->pg_test(pgio, prev, req); +	return pgio->pg_ops->pg_test(pgio, prev, req);  }  /** @@ -297,6 +296,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,  		if (!nfs_can_coalesce_requests(prev, req, desc))  			return 0;  	} else { +		if (desc->pg_ops->pg_init) +			desc->pg_ops->pg_init(desc, req);  		desc->pg_base = req->wb_pgbase;  	}  	nfs_list_remove_request(req); @@ -311,7 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,  static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)  {  	if (!list_empty(&desc->pg_list)) { -		int error = desc->pg_doio(desc); +		int error = desc->pg_ops->pg_doio(desc);  		if (error < 0)  			desc->pg_error = error;  		else @@ -331,7 +332,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)   * Returns true if the request 'req' was successfully coalesced into the   * existing list of pages 'desc'.   
*/ -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,  			   struct nfs_page *req)  {  	while (!nfs_pageio_do_add_request(desc, req)) { @@ -340,17 +341,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,  		if (desc->pg_error < 0)  			return 0;  		desc->pg_moreio = 0; +		if (desc->pg_recoalesce) +			return 0;  	}  	return 1;  } +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ +	LIST_HEAD(head); + +	do { +		list_splice_init(&desc->pg_list, &head); +		desc->pg_bytes_written -= desc->pg_count; +		desc->pg_count = 0; +		desc->pg_base = 0; +		desc->pg_recoalesce = 0; + +		while (!list_empty(&head)) { +			struct nfs_page *req; + +			req = list_first_entry(&head, struct nfs_page, wb_list); +			nfs_list_remove_request(req); +			if (__nfs_pageio_add_request(desc, req)) +				continue; +			if (desc->pg_error < 0) +				return 0; +			break; +		} +	} while (desc->pg_recoalesce); +	return 1; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +		struct nfs_page *req) +{ +	int ret; + +	do { +		ret = __nfs_pageio_add_request(desc, req); +		if (ret) +			break; +		if (desc->pg_error < 0) +			break; +		ret = nfs_do_recoalesce(desc); +	} while (ret); +	return ret; +} +  /**   * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor   * @desc: pointer to io descriptor   */  void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)  { -	nfs_pageio_doio(desc); +	for (;;) { +		nfs_pageio_doio(desc); +		if (!desc->pg_recoalesce) +			break; +		if (!nfs_do_recoalesce(desc)) +			break; +	}  }  /** @@ -369,7 +420,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)  	if (!list_empty(&desc->pg_list)) {  		struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);  		if (index != prev->wb_index + 1) -			nfs_pageio_doio(desc); +			nfs_pageio_complete(desc);  	}  } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 
29c0ca7fc34..e550e8836c3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -28,6 +28,7 @@   */  #include <linux/nfs_fs.h> +#include <linux/nfs_page.h>  #include "internal.h"  #include "pnfs.h"  #include "iostat.h" @@ -75,8 +76,11 @@ find_pnfs_driver(u32 id)  void  unset_pnfs_layoutdriver(struct nfs_server *nfss)  { -	if (nfss->pnfs_curr_ld) +	if (nfss->pnfs_curr_ld) { +		if (nfss->pnfs_curr_ld->clear_layoutdriver) +			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);  		module_put(nfss->pnfs_curr_ld->owner); +	}  	nfss->pnfs_curr_ld = NULL;  } @@ -87,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)   * @id layout type. Zero (illegal layout type) indicates pNFS not in use.   */  void -set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, +		      u32 id)  {  	struct pnfs_layoutdriver_type *ld_type = NULL; @@ -114,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)  		goto out_no_driver;  	}  	server->pnfs_curr_ld = ld_type; +	if (ld_type->set_layoutdriver +	    && ld_type->set_layoutdriver(server, mntfh)) { +		printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", +				__func__, id); +		module_put(ld_type->owner); +		goto out_no_driver; +	}  	dprintk("%s: pNFS module for %u set\n", __func__, id);  	return; @@ -189,6 +201,7 @@ static void  pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)  {  	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; +	put_rpccred(lo->plh_lc_cred);  	return ld->alloc_layout_hdr ? 
ld->free_layout_hdr(lo) : kfree(lo);  } @@ -223,6 +236,7 @@ static void  init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)  {  	INIT_LIST_HEAD(&lseg->pls_list); +	INIT_LIST_HEAD(&lseg->pls_lc_list);  	atomic_set(&lseg->pls_refcount, 1);  	smp_mb();  	set_bit(NFS_LSEG_VALID, &lseg->pls_flags); @@ -448,11 +462,20 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)  void  pnfs_destroy_all_layouts(struct nfs_client *clp)  { +	struct nfs_server *server;  	struct pnfs_layout_hdr *lo;  	LIST_HEAD(tmp_list); +	nfs4_deviceid_mark_client_invalid(clp); +	nfs4_deviceid_purge_client(clp); +  	spin_lock(&clp->cl_lock); -	list_splice_init(&clp->cl_layouts, &tmp_list); +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		if (!list_empty(&server->layouts)) +			list_splice_init(&server->layouts, &tmp_list); +	} +	rcu_read_unlock();  	spin_unlock(&clp->cl_lock);  	while (!list_empty(&tmp_list)) { @@ -661,6 +684,7 @@ _pnfs_return_layout(struct inode *ino)  	lrp->args.stateid = stateid;  	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;  	lrp->args.inode = ino; +	lrp->args.layout = lo;  	lrp->clp = NFS_SERVER(ino)->nfs_client;  	status = nfs4_proc_layoutreturn(lrp); @@ -805,7 +829,9 @@ out:  }  static struct pnfs_layout_hdr * -alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) +alloc_init_layout_hdr(struct inode *ino, +		      struct nfs_open_context *ctx, +		      gfp_t gfp_flags)  {  	struct pnfs_layout_hdr *lo; @@ -817,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)  	INIT_LIST_HEAD(&lo->plh_segs);  	INIT_LIST_HEAD(&lo->plh_bulk_recall);  	lo->plh_inode = ino; +	lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);  	return lo;  }  static struct pnfs_layout_hdr * -pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) +pnfs_find_alloc_layout(struct inode *ino, +		       struct nfs_open_context *ctx, +		       gfp_t gfp_flags)  {  	struct nfs_inode *nfsi = NFS_I(ino);  	struct 
pnfs_layout_hdr *new = NULL; @@ -836,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)  			return nfsi->layout;  	}  	spin_unlock(&ino->i_lock); -	new = alloc_init_layout_hdr(ino, gfp_flags); +	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);  	spin_lock(&ino->i_lock);  	if (likely(nfsi->layout == NULL))	/* Won the race? */ @@ -920,7 +949,8 @@ pnfs_update_layout(struct inode *ino,  	};  	unsigned pg_offset;  	struct nfs_inode *nfsi = NFS_I(ino); -	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; +	struct nfs_server *server = NFS_SERVER(ino); +	struct nfs_client *clp = server->nfs_client;  	struct pnfs_layout_hdr *lo;  	struct pnfs_layout_segment *lseg = NULL;  	bool first = false; @@ -928,7 +958,7 @@ pnfs_update_layout(struct inode *ino,  	if (!pnfs_enabled_sb(NFS_SERVER(ino)))  		return NULL;  	spin_lock(&ino->i_lock); -	lo = pnfs_find_alloc_layout(ino, gfp_flags); +	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);  	if (lo == NULL) {  		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);  		goto out_unlock; @@ -964,7 +994,7 @@ pnfs_update_layout(struct inode *ino,  		 */  		spin_lock(&clp->cl_lock);  		BUG_ON(!list_empty(&lo->plh_layouts)); -		list_add_tail(&lo->plh_layouts, &clp->cl_layouts); +		list_add_tail(&lo->plh_layouts, &server->layouts);  		spin_unlock(&clp->cl_lock);  	} @@ -973,7 +1003,8 @@ pnfs_update_layout(struct inode *ino,  		arg.offset -= pg_offset;  		arg.length += pg_offset;  	} -	arg.length = PAGE_CACHE_ALIGN(arg.length); +	if (arg.length != NFS4_MAX_UINT64) +		arg.length = PAGE_CACHE_ALIGN(arg.length);  	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);  	if (!lseg && first) { @@ -991,6 +1022,7 @@ out_unlock:  	spin_unlock(&ino->i_lock);  	goto out;  } +EXPORT_SYMBOL_GPL(pnfs_update_layout);  int  pnfs_layout_process(struct nfs4_layoutget *lgp) @@ -1048,35 +1080,71 @@ out_forget_reply:  	goto out;  } +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	
BUG_ON(pgio->pg_lseg != NULL); + +	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   req_offset(req), +					   req->wb_bytes, +					   IOMODE_READ, +					   GFP_KERNEL); +	/* If no lseg, fall back to read through mds */ +	if (pgio->pg_lseg == NULL) +		nfs_pageio_reset_read_mds(pgio); + +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	BUG_ON(pgio->pg_lseg != NULL); + +	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   req_offset(req), +					   req->wb_bytes, +					   IOMODE_RW, +					   GFP_NOFS); +	/* If no lseg, fall back to write through mds */ +	if (pgio->pg_lseg == NULL) +		nfs_pageio_reset_write_mds(pgio); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); +  bool -pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, -		     struct nfs_page *req) +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)  { -	enum pnfs_iomode access_type; -	gfp_t gfp_flags; +	struct nfs_server *server = NFS_SERVER(inode); +	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; -	/* We assume that pg_ioflags == 0 iff we're reading a page */ -	if (pgio->pg_ioflags == 0) { -		access_type = IOMODE_READ; -		gfp_flags = GFP_KERNEL; -	} else { -		access_type = IOMODE_RW; -		gfp_flags = GFP_NOFS; -	} +	if (ld == NULL) +		return false; +	nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0); +	return true; +} -	if (pgio->pg_lseg == NULL) { -		if (pgio->pg_count != prev->wb_bytes) -			return true; -		/* This is first coelesce call for a series of nfs_pages */ -		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, -						   prev->wb_context, -						   req_offset(prev), -						   pgio->pg_count, -						   access_type, -						   gfp_flags); -		if (pgio->pg_lseg == NULL) -			return true; -	} +bool +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 
struct inode *inode, int ioflags) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + +	if (ld == NULL) +		return false; +	nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags); +	return true; +} + +bool +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, +		     struct nfs_page *req) +{ +	if (pgio->pg_lseg == NULL) +		return nfs_generic_pg_test(pgio, prev, req);  	/*  	 * Test if a nfs_page is fully contained in the pnfs_layout_range. @@ -1120,15 +1188,30 @@ pnfs_ld_write_done(struct nfs_write_data *data)  }  EXPORT_SYMBOL_GPL(pnfs_ld_write_done); -enum pnfs_try_status +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, +		struct nfs_write_data *data) +{ +	list_splice_tail_init(&data->pages, &desc->pg_list); +	if (data->req && list_empty(&data->req->wb_list)) +		nfs_list_add_request(data->req, &desc->pg_list); +	nfs_pageio_reset_write_mds(desc); +	desc->pg_recoalesce = 1; +	nfs_writedata_release(data); +} + +static enum pnfs_try_status  pnfs_try_to_write_data(struct nfs_write_data *wdata, -			const struct rpc_call_ops *call_ops, int how) +			const struct rpc_call_ops *call_ops, +			struct pnfs_layout_segment *lseg, +			int how)  {  	struct inode *inode = wdata->inode;  	enum pnfs_try_status trypnfs;  	struct nfs_server *nfss = NFS_SERVER(inode);  	wdata->mds_ops = call_ops; +	wdata->lseg = get_lseg(lseg);  	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,  		inode->i_ino, wdata->args.count, wdata->args.offset, how); @@ -1144,6 +1227,44 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,  	return trypnfs;  } +static void +pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +{ +	struct nfs_write_data *data; +	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; +	struct pnfs_layout_segment *lseg = desc->pg_lseg; + +	desc->pg_lseg = NULL; +	while (!list_empty(head)) { +		enum 
pnfs_try_status trypnfs; + +		data = list_entry(head->next, struct nfs_write_data, list); +		list_del_init(&data->list); + +		trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); +		if (trypnfs == PNFS_NOT_ATTEMPTED) +			pnfs_write_through_mds(desc, data); +	} +	put_lseg(lseg); +} + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ +	LIST_HEAD(head); +	int ret; + +	ret = nfs_generic_flush(desc, &head); +	if (ret != 0) { +		put_lseg(desc->pg_lseg); +		desc->pg_lseg = NULL; +		return ret; +	} +	pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags); +	return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); +  /*   * Called by non rpc-based layout drivers   */ @@ -1167,18 +1288,32 @@ pnfs_ld_read_done(struct nfs_read_data *data)  }  EXPORT_SYMBOL_GPL(pnfs_ld_read_done); +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, +		struct nfs_read_data *data) +{ +	list_splice_tail_init(&data->pages, &desc->pg_list); +	if (data->req && list_empty(&data->req->wb_list)) +		nfs_list_add_request(data->req, &desc->pg_list); +	nfs_pageio_reset_read_mds(desc); +	desc->pg_recoalesce = 1; +	nfs_readdata_release(data); +} +  /*   * Call the appropriate parallel I/O subsystem read function.   
*/ -enum pnfs_try_status +static enum pnfs_try_status  pnfs_try_to_read_data(struct nfs_read_data *rdata, -		       const struct rpc_call_ops *call_ops) +		       const struct rpc_call_ops *call_ops, +		       struct pnfs_layout_segment *lseg)  {  	struct inode *inode = rdata->inode;  	struct nfs_server *nfss = NFS_SERVER(inode);  	enum pnfs_try_status trypnfs;  	rdata->mds_ops = call_ops; +	rdata->lseg = get_lseg(lseg);  	dprintk("%s: Reading ino:%lu %u@%llu\n",  		__func__, inode->i_ino, rdata->args.count, rdata->args.offset); @@ -1194,17 +1329,56 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,  	return trypnfs;  } +static void +pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ +	struct nfs_read_data *data; +	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; +	struct pnfs_layout_segment *lseg = desc->pg_lseg; + +	desc->pg_lseg = NULL; +	while (!list_empty(head)) { +		enum pnfs_try_status trypnfs; + +		data = list_entry(head->next, struct nfs_read_data, list); +		list_del_init(&data->list); + +		trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); +		if (trypnfs == PNFS_NOT_ATTEMPTED) +			pnfs_read_through_mds(desc, data); +	} +	put_lseg(lseg); +} + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ +	LIST_HEAD(head); +	int ret; + +	ret = nfs_generic_pagein(desc, &head); +	if (ret != 0) { +		put_lseg(desc->pg_lseg); +		desc->pg_lseg = NULL; +		return ret; +	} +	pnfs_do_multiple_reads(desc, &head); +	return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +  /* - * Currently there is only one (whole file) write lseg. + * There can be multiple RW segments.   
*/ -static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)  { -	struct pnfs_layout_segment *lseg, *rv = NULL; +	struct pnfs_layout_segment *lseg; -	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) -		if (lseg->pls_range.iomode == IOMODE_RW) -			rv = lseg; -	return rv; +	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { +		if (lseg->pls_range.iomode == IOMODE_RW && +		    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) +			list_add(&lseg->pls_lc_list, listp); +	}  }  void @@ -1216,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)  	spin_lock(&nfsi->vfs_inode.i_lock);  	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { -		/* references matched in nfs4_layoutcommit_release */ -		get_lseg(wdata->lseg); -		wdata->lseg->pls_lc_cred = -			get_rpccred(wdata->args.context->state->owner->so_cred);  		mark_as_dirty = true;  		dprintk("%s: Set layoutcommit for inode %lu ",  			__func__, wdata->inode->i_ino);  	} -	if (end_pos > wdata->lseg->pls_end_pos) -		wdata->lseg->pls_end_pos = end_pos; +	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { +		/* references matched in nfs4_layoutcommit_release */ +		get_lseg(wdata->lseg); +	} +	if (end_pos > nfsi->layout->plh_lwb) +		nfsi->layout->plh_lwb = end_pos;  	spin_unlock(&nfsi->vfs_inode.i_lock); +	dprintk("%s: lseg %p end_pos %llu\n", +		__func__, wdata->lseg, nfsi->layout->plh_lwb);  	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one  	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ @@ -1235,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)  }  EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ +	struct nfs_server *nfss = NFS_SERVER(data->args.inode); + +	if (nfss->pnfs_curr_ld->cleanup_layoutcommit) +		
nfss->pnfs_curr_ld->cleanup_layoutcommit(data); +} +  /*   * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and   * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough @@ -1248,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)  {  	struct nfs4_layoutcommit_data *data;  	struct nfs_inode *nfsi = NFS_I(inode); -	struct pnfs_layout_segment *lseg; -	struct rpc_cred *cred;  	loff_t end_pos;  	int status = 0; @@ -1266,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)  		goto out;  	} +	INIT_LIST_HEAD(&data->lseg_list);  	spin_lock(&inode->i_lock);  	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {  		spin_unlock(&inode->i_lock);  		kfree(data);  		goto out;  	} -	/* -	 * Currently only one (whole file) write lseg which is referenced -	 * in pnfs_set_layoutcommit and will be found. -	 */ -	lseg = pnfs_list_write_lseg(inode); -	end_pos = lseg->pls_end_pos; -	cred = lseg->pls_lc_cred; -	lseg->pls_end_pos = 0; -	lseg->pls_lc_cred = NULL; +	pnfs_list_write_lseg(inode, &data->lseg_list); + +	end_pos = nfsi->layout->plh_lwb; +	nfsi->layout->plh_lwb = 0;  	memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,  		sizeof(nfsi->layout->plh_stateid.data));  	spin_unlock(&inode->i_lock);  	data->args.inode = inode; -	data->lseg = lseg; -	data->cred = cred; +	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);  	nfs_fattr_init(&data->fattr);  	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;  	data->res.fattr = &data->fattr; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 96bf4e6f45b..01cbfd54f3c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -36,16 +36,16 @@  enum {  	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */  	NFS_LSEG_ROC,		/* roc bit received from server */ +	NFS_LSEG_LAYOUTCOMMIT,	/* layoutcommit bit set for layoutcommit */  };  struct pnfs_layout_segment {  	struct list_head pls_list; +	struct list_head pls_lc_list;  	struct 
pnfs_layout_range pls_range;  	atomic_t pls_refcount;  	unsigned long pls_flags;  	struct pnfs_layout_hdr *pls_layout; -	struct rpc_cred	*pls_lc_cred; /* LAYOUTCOMMIT credential */ -	loff_t pls_end_pos; /* LAYOUTCOMMIT write end */  };  enum pnfs_try_status { @@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {  	struct module *owner;  	unsigned flags; +	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); +	int (*clear_layoutdriver) (struct nfs_server *); +  	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);  	void (*free_layout_hdr) (struct pnfs_layout_hdr *); @@ -87,7 +90,8 @@ struct pnfs_layoutdriver_type {  	void (*free_lseg) (struct pnfs_layout_segment *lseg);  	/* test for nfs page cache coalescing */ -	bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); +	const struct nfs_pageio_ops *pg_read_ops; +	const struct nfs_pageio_ops *pg_write_ops;  	/* Returns true if layoutdriver wants to divert this request to  	 * driver's commit routine. 
@@ -109,6 +113,8 @@ struct pnfs_layoutdriver_type {  				     struct xdr_stream *xdr,  				     const struct nfs4_layoutreturn_args *args); +	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); +  	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,  				     struct xdr_stream *xdr,  				     const struct nfs4_layoutcommit_args *args); @@ -124,6 +130,8 @@ struct pnfs_layout_hdr {  	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */  	u32			plh_barrier; /* ignore lower seqids */  	unsigned long		plh_flags; +	loff_t			plh_lwb; /* last write byte for layoutcommit */ +	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */  	struct inode		*plh_inode;  }; @@ -136,10 +144,21 @@ struct pnfs_device {  	unsigned int  pglen;  }; +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 + +struct pnfs_devicelist { +	unsigned int		eof; +	unsigned int		num_devs; +	struct nfs4_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; +}; +  extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);  extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);  /* nfs4proc.c */ +extern int nfs4_proc_getdevicelist(struct nfs_server *server, +				   const struct nfs_fh *fh, +				   struct pnfs_devicelist *devlist);  extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,  				   struct pnfs_device *dev);  extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); @@ -148,16 +167,16 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);  /* pnfs.c */  void get_layout_hdr(struct pnfs_layout_hdr *lo);  void put_lseg(struct pnfs_layout_segment *lseg); -struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, -		   loff_t pos, u64 count, enum pnfs_iomode access_type, -		   gfp_t gfp_flags); -void set_pnfs_layoutdriver(struct nfs_server *, u32 id); + +bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct 
inode *, int); + +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);  void unset_pnfs_layoutdriver(struct nfs_server *); -enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, -					     const struct rpc_call_ops *, int); -enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, -					    const struct rpc_call_ops *); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);  bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);  int pnfs_layout_process(struct nfs4_layoutget *lgp);  void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -178,10 +197,24 @@ void pnfs_roc_release(struct inode *ino);  void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);  bool pnfs_roc_drain(struct inode *ino, u32 *barrier);  void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);  int pnfs_layoutcommit_inode(struct inode *inode, bool sync);  int _pnfs_return_layout(struct inode *);  int pnfs_ld_write_done(struct nfs_write_data *);  int pnfs_ld_read_done(struct nfs_read_data *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, +					       struct nfs_open_context *ctx, +					       loff_t pos, +					       u64 count, +					       enum pnfs_iomode iomode, +					       gfp_t gfp_flags); + +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); + +/* nfs4_deviceid_flags */ +enum { +	NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */ +};  /* pnfs_dev.c */  struct nfs4_deviceid_node { @@ -189,13 +222,13 @@ struct nfs4_deviceid_node {  	struct hlist_node		tmpnode;  	const struct pnfs_layoutdriver_type *ld;  	const struct 
nfs_client		*nfs_client; +	unsigned long 			flags;  	struct nfs4_deviceid		deviceid;  	atomic_t			ref;  };  void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);  struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);  void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);  void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,  			     const struct pnfs_layoutdriver_type *, @@ -293,15 +326,6 @@ static inline int pnfs_return_layout(struct inode *ino)  	return 0;  } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, -				    struct inode *inode) -{ -	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - -	if (ld) -		pgio->pg_test = ld->pg_test; -} -  #else  /* CONFIG_NFS_V4_1 */  static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -322,28 +346,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)  {  } -static inline struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, -		   loff_t pos, u64 count, enum pnfs_iomode access_type, -		   gfp_t gfp_flags) -{ -	return NULL; -} - -static inline enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *data, -		      const struct rpc_call_ops *call_ops) -{ -	return PNFS_NOT_ATTEMPTED; -} - -static inline enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *data, -		       const struct rpc_call_ops *call_ops, int how) -{ -	return PNFS_NOT_ATTEMPTED; -} -  static inline int pnfs_return_layout(struct inode *ino)  {  	return 0; @@ -377,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)  	return false;  } -static inline void set_pnfs_layoutdriver(struct nfs_server *s, 
u32 id) +static inline void set_pnfs_layoutdriver(struct nfs_server *s, +					 const struct nfs_fh *mntfh, u32 id)  {  } @@ -385,9 +388,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)  {  } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, -				    struct inode *inode) +static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) +{ +	return false; +} + +static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)  { +	return false;  }  static inline void diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index f0f8e1e22f6..6fda5228ef5 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -100,8 +100,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,  	rcu_read_lock();  	d = _lookup_deviceid(ld, clp, id, hash); -	if (d && !atomic_inc_not_zero(&d->ref)) -		d = NULL; +	if (d != NULL) +		atomic_inc(&d->ref);  	rcu_read_unlock();  	return d;  } @@ -115,15 +115,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,  EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);  /* - * Unhash and put deviceid + * Remove a deviceid from cache   *   * @clp nfs_client associated with deviceid   * @id the deviceid to unhash   *   * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.   
*/ -struct nfs4_deviceid_node * -nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,  			 const struct nfs_client *clp, const struct nfs4_deviceid *id)  {  	struct nfs4_deviceid_node *d; @@ -134,7 +134,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,  	rcu_read_unlock();  	if (!d) {  		spin_unlock(&nfs4_deviceid_lock); -		return NULL; +		return;  	}  	hlist_del_init_rcu(&d->node);  	spin_unlock(&nfs4_deviceid_lock); @@ -142,28 +142,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,  	/* balance the initial ref set in pnfs_insert_deviceid */  	if (atomic_dec_and_test(&d->ref)) -		return d; - -	return NULL; -} -EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); - -/* - * Delete a deviceid from cache - * - * @clp struct nfs_client qualifying the deviceid - * @id deviceid to delete - */ -void -nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, -		     const struct nfs_client *clp, const struct nfs4_deviceid *id) -{ -	struct nfs4_deviceid_node *d; - -	d = nfs4_unhash_put_deviceid(ld, clp, id); -	if (!d) -		return; -	d->ld->free_deviceid_node(d); +		d->ld->free_deviceid_node(d);  }  EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -177,6 +156,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,  	INIT_HLIST_NODE(&d->tmpnode);  	d->ld = ld;  	d->nfs_client = nfs_client; +	d->flags = 0;  	d->deviceid = *id;  	atomic_set(&d->ref, 1);  } @@ -221,16 +201,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);   *   * @d deviceid node to put   * - * @ret true iff the node was deleted + * return true iff the node was deleted + * Note that since the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache.   
*/  bool  nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)  { -	if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) +	if (!atomic_dec_and_test(&d->ref))  		return false; -	hlist_del_init_rcu(&d->node); -	spin_unlock(&nfs4_deviceid_lock); -	synchronize_rcu();  	d->ld->free_deviceid_node(d);  	return true;  } @@ -275,3 +254,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp)  	for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)  		_deviceid_purge_client(clp, h);  } + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ +	struct nfs4_deviceid_node *d; +	struct hlist_node *n; +	int i; + +	rcu_read_lock(); +	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ +		hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) +			if (d->nfs_client == clp) +				set_bit(NFS_DEVICEID_INVALID, &d->flags); +	} +	rcu_read_unlock(); +} diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a68679f538f..2171c043ab0 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -30,8 +30,7 @@  #define NFSDBG_FACILITY		NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); +static const struct nfs_pageio_ops nfs_pageio_read_ops;  static const struct rpc_call_ops nfs_read_partial_ops;  static const struct rpc_call_ops nfs_read_full_ops; @@ -68,7 +67,7 @@ void nfs_readdata_free(struct nfs_read_data *p)  	mempool_free(p, nfs_rdata_mempool);  } -static void nfs_readdata_release(struct nfs_read_data *rdata) +void nfs_readdata_release(struct nfs_read_data *rdata)  {  	put_lseg(rdata->lseg);  	put_nfs_open_context(rdata->args.context); @@ -113,6 +112,27 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)  	}  } +static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +		struct inode *inode) +{ +	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, +			NFS_SERVER(inode)->rsize, 
0); +} + +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ +	pgio->pg_ops = &nfs_pageio_read_ops; +	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); + +static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, +		struct inode *inode) +{ +	if (!pnfs_pageio_init_read(pgio, inode)) +		nfs_pageio_init_read_mds(pgio, inode); +} +  int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,  		       struct page *page)  { @@ -131,14 +151,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,  	if (len < PAGE_CACHE_SIZE)  		zero_user_segment(page, len, PAGE_CACHE_SIZE); -	nfs_pageio_init(&pgio, inode, NULL, 0, 0); -	nfs_list_add_request(new, &pgio.pg_list); -	pgio.pg_count = len; - -	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) -		nfs_pagein_multi(&pgio); -	else -		nfs_pagein_one(&pgio); +	nfs_pageio_init_read(&pgio, inode); +	nfs_pageio_add_request(&pgio, new); +	nfs_pageio_complete(&pgio);  	return 0;  } @@ -202,17 +217,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);  /*   * Set up the NFS read request struct   */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, -		const struct rpc_call_ops *call_ops, -		unsigned int count, unsigned int offset, -		struct pnfs_layout_segment *lseg) +static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +		unsigned int count, unsigned int offset)  {  	struct inode *inode = req->wb_context->dentry->d_inode;  	data->req	  = req;  	data->inode	  = inode;  	data->cred	  = req->wb_context->cred; -	data->lseg	  = get_lseg(lseg);  	data->args.fh     = NFS_FH(inode);  	data->args.offset = req_offset(req) + offset; @@ -226,14 +238,36 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,  	data->res.count   = count;  	data->res.eof     = 0;  	nfs_fattr_init(&data->fattr); +} -	if (data->lseg && -	    (pnfs_try_to_read_data(data, call_ops) == 
PNFS_ATTEMPTED)) -		return 0; +static int nfs_do_read(struct nfs_read_data *data, +		const struct rpc_call_ops *call_ops) +{ +	struct inode *inode = data->args.context->dentry->d_inode;  	return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);  } +static int +nfs_do_multiple_reads(struct list_head *head, +		const struct rpc_call_ops *call_ops) +{ +	struct nfs_read_data *data; +	int ret = 0; + +	while (!list_empty(head)) { +		int ret2; + +		data = list_entry(head->next, struct nfs_read_data, list); +		list_del_init(&data->list); + +		ret2 = nfs_do_read(data, call_ops); +		if (ret == 0) +			ret = ret2; +	} +	return ret; +} +  static void  nfs_async_read_error(struct list_head *head)  { @@ -260,20 +294,19 @@ nfs_async_read_error(struct list_head *head)   * won't see the new data until our attribute cache is updated.  This is more   * or less conventional NFS client behavior.   */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)  {  	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);  	struct page *page = req->wb_page;  	struct nfs_read_data *data; -	size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; +	size_t rsize = desc->pg_bsize, nbytes;  	unsigned int offset;  	int requests = 0;  	int ret = 0; -	struct pnfs_layout_segment *lseg; -	LIST_HEAD(list);  	nfs_list_remove_request(req); +	offset = 0;  	nbytes = desc->pg_count;  	do {  		size_t len = min(nbytes,rsize); @@ -281,45 +314,21 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)  		data = nfs_readdata_alloc(1);  		if (!data)  			goto out_bad; -		list_add(&data->pages, &list); +		data->pagevec[0] = page; +		nfs_read_rpcsetup(req, data, len, offset); +		list_add(&data->list, res);  		requests++;  		nbytes -= len; +		offset += len;  	} while(nbytes != 0);  	atomic_set(&req->wb_complete, requests); - -	BUG_ON(desc->pg_lseg != NULL); -	lseg = pnfs_update_layout(desc->pg_inode, 
req->wb_context, -				  req_offset(req), desc->pg_count, -				  IOMODE_READ, GFP_KERNEL);  	ClearPageError(page); -	offset = 0; -	nbytes = desc->pg_count; -	do { -		int ret2; - -		data = list_entry(list.next, struct nfs_read_data, pages); -		list_del_init(&data->pages); - -		data->pagevec[0] = page; - -		if (nbytes < rsize) -			rsize = nbytes; -		ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, -					 rsize, offset, lseg); -		if (ret == 0) -			ret = ret2; -		offset += rsize; -		nbytes -= rsize; -	} while (nbytes != 0); -	put_lseg(lseg); -	desc->pg_lseg = NULL; - +	desc->pg_rpc_callops = &nfs_read_partial_ops;  	return ret; -  out_bad: -	while (!list_empty(&list)) { -		data = list_entry(list.next, struct nfs_read_data, pages); -		list_del(&data->pages); +	while (!list_empty(res)) { +		data = list_entry(res->next, struct nfs_read_data, list); +		list_del(&data->list);  		nfs_readdata_free(data);  	}  	SetPageError(page); @@ -327,19 +336,19 @@ out_bad:  	return -ENOMEM;  } -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res)  {  	struct nfs_page		*req;  	struct page		**pages;  	struct nfs_read_data	*data;  	struct list_head *head = &desc->pg_list; -	struct pnfs_layout_segment *lseg = desc->pg_lseg; -	int ret = -ENOMEM; +	int ret = 0;  	data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,  						     desc->pg_count));  	if (!data) {  		nfs_async_read_error(head); +		ret = -ENOMEM;  		goto out;  	} @@ -352,19 +361,37 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)  		*pages++ = req->wb_page;  	}  	req = nfs_list_entry(data->pages.next); -	if ((!lseg) && list_is_singular(&data->pages)) -		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, -					  req_offset(req), desc->pg_count, -					  IOMODE_READ, GFP_KERNEL); -	ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, -				0, lseg); +	nfs_read_rpcsetup(req, data, 
desc->pg_count, 0); +	list_add(&data->list, res); +	desc->pg_rpc_callops = &nfs_read_full_ops;  out: -	put_lseg(lseg); -	desc->pg_lseg = NULL;  	return ret;  } +int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ +	if (desc->pg_bsize < PAGE_CACHE_SIZE) +		return nfs_pagein_multi(desc, head); +	return nfs_pagein_one(desc, head); +} + +static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ +	LIST_HEAD(head); +	int ret; + +	ret = nfs_generic_pagein(desc, &head); +	if (ret == 0) +		ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops); +	return ret; +} + +static const struct nfs_pageio_ops nfs_pageio_read_ops = { +	.pg_test = nfs_generic_pg_test, +	.pg_doio = nfs_generic_pg_readpages, +}; +  /*   * This is the callback from RPC telling us whether a reply was   * received or some error occurred (timeout or socket shutdown). @@ -635,8 +662,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,  		.pgio = &pgio,  	};  	struct inode *inode = mapping->host; -	struct nfs_server *server = NFS_SERVER(inode); -	size_t rsize = server->rsize;  	unsigned long npages;  	int ret = -ESTALE; @@ -664,10 +689,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,  	if (ret == 0)  		goto read_complete; /* all pages were read */ -	if (rsize < PAGE_CACHE_SIZE) -		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); -	else -		nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); +	nfs_pageio_init_read(&pgio, inode);  	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 8d6864c2a5f..b2fbbde58e4 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n  	alias = d_lookup(parent, &data->args.name);  	if (alias != NULL) { -		int ret = 0; +		int ret;  		void *devname_garbage = NULL;  		/* @@ -155,14 +155,16 @@ static int 
nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n  		 * the sillyrename information to the aliased dentry.  		 */  		nfs_free_dname(data); +		ret = nfs_copy_dname(alias, data);  		spin_lock(&alias->d_lock); -		if (alias->d_inode != NULL && +		if (ret == 0 && alias->d_inode != NULL &&  		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {  			devname_garbage = alias->d_fsdata;  			alias->d_fsdata = data;  			alias->d_flags |= DCACHE_NFSFS_RENAMED;  			ret = 1; -		} +		} else +			ret = 0;  		spin_unlock(&alias->d_lock);  		nfs_dec_sillycount(dir);  		dput(alias); @@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n  		 * point dentry is definitely not a root, so we won't need  		 * that anymore.  		 */ -		if (devname_garbage) -			kfree(devname_garbage); +		kfree(devname_garbage);  		return ret;  	}  	data->dir = igrab(dir); @@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)  	if (parent == NULL)  		goto out_free;  	dir = parent->d_inode; -	if (nfs_copy_dname(dentry, data) != 0) -		goto out_dput;  	/* Non-exclusive lock protects against concurrent lookup() calls */  	spin_lock(&dir->i_lock);  	if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -366,6 +365,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)  	struct nfs_renamedata *data = calldata;  	struct inode *old_dir = data->old_dir;  	struct inode *new_dir = data->new_dir; +	struct dentry *old_dentry = data->old_dentry; +	struct dentry *new_dentry = data->new_dentry;  	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {  		nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); @@ -373,12 +374,12 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)  	}  	if (task->tk_status != 0) { -		nfs_cancel_async_unlink(data->old_dentry); +		nfs_cancel_async_unlink(old_dentry);  		return;  	} -	nfs_set_verifier(data->old_dentry, 
nfs_save_change_attribute(old_dir)); -	d_move(data->old_dentry, data->new_dentry); +	d_drop(old_dentry); +	d_drop(new_dentry);  }  /** @@ -501,6 +502,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,   * and only performs the unlink once the last reference to it is put.   *   * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced.  The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots.  However, we may not assume a server does so.  (RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.))   */  int  nfs_sillyrename(struct inode *dir, struct dentry *dentry) @@ -560,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  	if (error)  		goto out_dput; +	/* populate unlinkdata with the right dname */ +	error = nfs_copy_dname(sdentry, +				(struct nfs_unlinkdata *)dentry->d_fsdata); +	if (error) { +		nfs_cancel_async_unlink(dentry); +		goto out_dput; +	} +  	/* run the rename task, undo unlink if it fails */  	task = nfs_async_rename(dir, dir, dentry, sdentry);  	if (IS_ERR(task)) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08579312c57..b39b37f8091 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -97,7 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)  	mempool_free(p, nfs_wdata_mempool);  } -static void nfs_writedata_release(struct nfs_write_data *wdata) +void nfs_writedata_release(struct nfs_write_data *wdata)  {  	put_lseg(wdata->lseg);  	put_nfs_open_context(wdata->args.context); @@ -845,11 +845,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);  /*   * Set up the argument/result storage required for the RPC call.   
*/ -static int nfs_write_rpcsetup(struct nfs_page *req, +static void nfs_write_rpcsetup(struct nfs_page *req,  		struct nfs_write_data *data, -		const struct rpc_call_ops *call_ops,  		unsigned int count, unsigned int offset, -		struct pnfs_layout_segment *lseg,  		int how)  {  	struct inode *inode = req->wb_context->dentry->d_inode; @@ -860,7 +858,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,  	data->req = req;  	data->inode = inode = req->wb_context->dentry->d_inode;  	data->cred = req->wb_context->cred; -	data->lseg = get_lseg(lseg);  	data->args.fh     = NFS_FH(inode);  	data->args.offset = req_offset(req) + offset; @@ -872,24 +869,51 @@ static int nfs_write_rpcsetup(struct nfs_page *req,  	data->args.context = get_nfs_open_context(req->wb_context);  	data->args.lock_context = req->wb_lock_context;  	data->args.stable  = NFS_UNSTABLE; -	if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { -		data->args.stable = NFS_DATA_SYNC; -		if (!nfs_need_commit(NFS_I(inode))) -			data->args.stable = NFS_FILE_SYNC; +	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { +	case 0: +		break; +	case FLUSH_COND_STABLE: +		if (nfs_need_commit(NFS_I(inode))) +			break; +	default: +		data->args.stable = NFS_FILE_SYNC;  	}  	data->res.fattr   = &data->fattr;  	data->res.count   = count;  	data->res.verf    = &data->verf;  	nfs_fattr_init(&data->fattr); +} -	if (data->lseg && -	    (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) -		return 0; +static int nfs_do_write(struct nfs_write_data *data, +		const struct rpc_call_ops *call_ops, +		int how) +{ +	struct inode *inode = data->args.context->dentry->d_inode;  	return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);  } +static int nfs_do_multiple_writes(struct list_head *head, +		const struct rpc_call_ops *call_ops, +		int how) +{ +	struct nfs_write_data *data; +	int ret = 0; + +	while (!list_empty(head)) { +		int ret2; + +		data = list_entry(head->next, struct nfs_write_data, list); +		
list_del_init(&data->list); +		 +		ret2 = nfs_do_write(data, call_ops, how); +		 if (ret == 0) +			 ret = ret2; +	} +	return ret; +} +  /* If a nfs_flush_* function fails, it should remove reqs from @head and   * call this on each, which will prepare them to be retried on next   * writeback using standard nfs. @@ -907,17 +931,15 @@ static void nfs_redirty_request(struct nfs_page *req)   * Generate multiple small requests to write out a single   * contiguous dirty area on one page.   */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)  {  	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);  	struct page *page = req->wb_page;  	struct nfs_write_data *data; -	size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; +	size_t wsize = desc->pg_bsize, nbytes;  	unsigned int offset;  	int requests = 0;  	int ret = 0; -	struct pnfs_layout_segment *lseg; -	LIST_HEAD(list);  	nfs_list_remove_request(req); @@ -927,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)  		desc->pg_ioflags &= ~FLUSH_COND_STABLE; +	offset = 0;  	nbytes = desc->pg_count;  	do {  		size_t len = min(nbytes, wsize); @@ -934,45 +957,21 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)  		data = nfs_writedata_alloc(1);  		if (!data)  			goto out_bad; -		list_add(&data->pages, &list); +		data->pagevec[0] = page; +		nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); +		list_add(&data->list, res);  		requests++;  		nbytes -= len; +		offset += len;  	} while (nbytes != 0);  	atomic_set(&req->wb_complete, requests); - -	BUG_ON(desc->pg_lseg); -	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, -				  req_offset(req), desc->pg_count, -				  IOMODE_RW, GFP_NOFS); -	ClearPageError(page); -	offset = 0; -	nbytes = desc->pg_count; -	do { -		int ret2; - -		data = list_entry(list.next, struct nfs_write_data, pages); -		list_del_init(&data->pages); - 
-		data->pagevec[0] = page; - -		if (nbytes < wsize) -			wsize = nbytes; -		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, -					  wsize, offset, lseg, desc->pg_ioflags); -		if (ret == 0) -			ret = ret2; -		offset += wsize; -		nbytes -= wsize; -	} while (nbytes != 0); - -	put_lseg(lseg); -	desc->pg_lseg = NULL; +	desc->pg_rpc_callops = &nfs_write_partial_ops;  	return ret;  out_bad: -	while (!list_empty(&list)) { -		data = list_entry(list.next, struct nfs_write_data, pages); -		list_del(&data->pages); +	while (!list_empty(res)) { +		data = list_entry(res->next, struct nfs_write_data, list); +		list_del(&data->list);  		nfs_writedata_free(data);  	}  	nfs_redirty_request(req); @@ -987,14 +986,13 @@ out_bad:   * This is the case if nfs_updatepage detects a conflicting request   * that has been written but not committed.   */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res)  {  	struct nfs_page		*req;  	struct page		**pages;  	struct nfs_write_data	*data;  	struct list_head *head = &desc->pg_list; -	struct pnfs_layout_segment *lseg = desc->pg_lseg; -	int ret; +	int ret = 0;  	data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,  						      desc->pg_count)); @@ -1016,32 +1014,62 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)  		*pages++ = req->wb_page;  	}  	req = nfs_list_entry(data->pages.next); -	if ((!lseg) && list_is_singular(&data->pages)) -		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, -					  req_offset(req), desc->pg_count, -					  IOMODE_RW, GFP_NOFS);  	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&  	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))  		desc->pg_ioflags &= ~FLUSH_COND_STABLE;  	/* Set up the argument struct */ -	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); +	nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); +	
list_add(&data->list, res); +	desc->pg_rpc_callops = &nfs_write_full_ops;  out: -	put_lseg(lseg); /* Cleans any gotten in ->pg_test */ -	desc->pg_lseg = NULL;  	return ret;  } -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ +	if (desc->pg_bsize < PAGE_CACHE_SIZE) +		return nfs_flush_multi(desc, head); +	return nfs_flush_one(desc, head); +} + +static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ +	LIST_HEAD(head); +	int ret; + +	ret = nfs_generic_flush(desc, &head); +	if (ret == 0) +		ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops, +				desc->pg_ioflags); +	return ret; +} + +static const struct nfs_pageio_ops nfs_pageio_write_ops = { +	.pg_test = nfs_generic_pg_test, +	.pg_doio = nfs_generic_pg_writepages, +}; + +static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,  				  struct inode *inode, int ioflags)  { -	size_t wsize = NFS_SERVER(inode)->wsize; +	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, +				NFS_SERVER(inode)->wsize, ioflags); +} + +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) +{ +	pgio->pg_ops = &nfs_pageio_write_ops; +	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -	if (wsize < PAGE_CACHE_SIZE) -		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); -	else -		nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +				  struct inode *inode, int ioflags) +{ +	if (!pnfs_pageio_init_write(pgio, inode, ioflags)) +		nfs_pageio_init_write_mds(pgio, inode, ioflags);  }  /* @@ -1566,8 +1594,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)  		int status;  		bool sync = true; -		if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || -		    wbc->for_background) +		if (wbc->sync_mode == WB_SYNC_NONE)  			
sync = false;  		status = pnfs_layoutcommit_inode(inode, sync);  |